pyspark运行ALS推荐算法

首先创建读取路径,这个取决于你的pyspark是以什么方式启动的

# Pick the base location of the data files from the way the SparkContext was
# started: a master URL beginning with "local" reads from the local
# filesystem, any other master reads from HDFS.
# (The former module-level `global Path` statement was dropped: `global` has
# no effect outside a function body.)
if sc.master.startswith('local'):
    Path = 'file:/home/swt/pythonwork/PythonProject/'
else:
    Path = "hdfs://ubuntu:9000/user/swt/"

我是本地启动

# Display the master URL the SparkContext is running against.
sc.master
'local[*]'

开始读取数据

# Load the raw ratings file as an RDD of text lines. Each line holds
# tab-separated fields (see the sample printed by first()).
rawUserData = sc.textFile(Path + "data/u.data")
rawUserData.count()
rawUserData.first()

# Keep only the first three tab-separated fields of every line; they are
# later passed to ALS.train as (user, product, rating).
rawRatings = rawUserData.map(lambda record: record.split("\t")[:3])
rawRatings.take(5)

# Convert each three-element list into a tuple (values remain strings).
ratingsRDD = rawRatings.map(lambda fields: (fields[0], fields[1], fields[2]))
ratingsRDD.take(5)
100000
'196\t242\t3\t881250949'
[['196', '242', '3'],
 ['186', '302', '3'],
 ['22', '377', '1'],
 ['244', '51', '2'],
 ['166', '346', '1']]
[('196', '242', '3'),
 ('186', '302', '3'),
 ('22', '377', '1'),
 ('244', '51', '2'),
 ('166', '346', '1')]
# Summarise the data set: total ratings, distinct users, distinct movies.
numRatings = ratingsRDD.count()
numRatings

# Field 0 of every tuple is the user id.
numUsers = (
    ratingsRDD
    .map(lambda rating: rating[0])
    .distinct()
    .count()
)
numUsers

# Field 1 of every tuple is the movie id.
numMovles = (
    ratingsRDD
    .map(lambda rating: rating[1])
    .distinct()
    .count()
)
numMovles
100000
943
1682
# Train on the prepared ratings; ALS.train returns a MatrixFactorizationModel.
from pyspark.mllib.recommendation import ALS

model = ALS.train(
    ratingsRDD,
    rank=10,
    iterations=10,
    lambda_=0.01,
)
print(model)
<pyspark.mllib.recommendation.MatrixFactorizationModel object at 0x7fb87179b908>

# Top 5 movies recommended for the user with id 100, with predicted ratings.
model.recommendProducts(user=100, num=5)
[Rating(user=100, product=1160, rating=7.146604192513291),
 Rating(user=100, product=1631, rating=6.491199349189874),
 Rating(user=100, product=253, rating=6.4363095037441145),
 Rating(user=100, product=904, rating=6.310990410281013),
 Rating(user=100, product=1097, rating=5.990350261674924)]
# Top 5 users to whom the movie with id 200 is recommended, with predicted ratings.
model.recommendUsers(product=200, num=5)
[Rating(user=762, product=200, rating=6.990901173944179),
 Rating(user=55, product=200, rating=6.828390321151252),
 Rating(user=581, product=200, rating=6.131202021531233),
 Rating(user=888, product=200, rating=6.026787459201902),
 Rating(user=818, product=200, rating=5.947695724505568)]
# Read the movie metadata file and build an {id: title} lookup on the driver.
itemRDD = sc.textFile(Path + "data/u.item")
itemRDD.count()

# Fields are "|"-separated: column 0 is the movie id, column 1 the title.
# The id is stored as a float key; an int product id still finds it because
# numerically equal int and float values hash and compare equal.
moiveTitle = (
    itemRDD
    .map(lambda record: record.split("|"))
    .map(lambda cols: (float(cols[0]), cols[1]))
    .collectAsMap()
)
len(moiveTitle)
list(moiveTitle.items())[:5]
1682
1682
[(1.0, 'Toy Story (1995)'),
 (2.0, 'GoldenEye (1995)'),
 (3.0, 'Four Rooms (1995)'),
 (4.0, 'Get Shorty (1995)'),
 (5.0, 'Copycat (1995)')]
# Fetch the top 5 recommendations for user 100 and print each one with its
# movie title instead of the bare movie id.
# (The snippet used to appear twice in a row, which recomputed the
# recommendations and printed every line twice; it is kept once.)
recommendP = model.recommendProducts(100, 5)
for p in recommendP:
    # Each element is a Rating(user, product, rating) named tuple.
    print("对用户{0} 推荐电影 {1} 推荐评分为: {2}".format(
        p.user, moiveTitle[p.product], p.rating))

对用户100 推荐电影 Love! Valour! Compassion! (1997) 推荐评分为: 7.146604192513291
对用户100 推荐电影 Slingshot, The (1993) 推荐评分为: 6.491199349189874
对用户100 推荐电影 Pillow Book, The (1995) 推荐评分为: 6.4363095037441145
对用户100 推荐电影 Ma vie en rose (My Life in Pink) (1997) 推荐评分为: 6.310990410281013
对用户100 推荐电影 Hate (Haine, La) (1995) 推荐评分为: 5.990350261674924

# Persist the trained model under <Path>/ALSmodel.
# model.save() refuses to write into an existing directory and raises
# FileAlreadyExistsException, so a directory left by an earlier run is removed
# first. The Hadoop FileSystem API is used because it handles both the
# local "file:" and the "hdfs:" form of Path.
model_path = Path + 'ALSmodel'
hadoop_model_path = sc._jvm.org.apache.hadoop.fs.Path(model_path)
hadoop_fs = hadoop_model_path.getFileSystem(sc._jsc.hadoopConfiguration())
if hadoop_fs.exists(hadoop_model_path):
    # Second argument True = delete recursively.
    hadoop_fs.delete(hadoop_model_path, True)
model.save(sc, model_path)

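在这里保存模型的时候报了错误。从下面报错信息中的 FileAlreadyExistsException 可以看出,原因是输出目录 ALSmodel 已经存在(很可能是之前已经保存过一次),而 model.save 不会覆盖已有的目录;先删除该目录,再重新保存即可。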

Py4JJavaError                             Traceback (most recent call last)
<ipython-input-22-24dc419c2741> in <module>()
----> 1 model.save(sc,Path+'ALSmodel')

/usr/local/spark/python/pyspark/mllib/util.py in save(self, sc, path)
    404         if not isinstance(path, basestring):
    405             raise TypeError("path should be a basestring, got type %s" % type(path))
--> 406         self._java_model.save(sc._jsc.sc(), path)
    407 
    408 

/usr/local/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1131         answer = self.gateway_client.send_command(command)
   1132         return_value = get_return_value(
-> 1133             answer, self.gateway_client, self.target_id, self.name)
   1134 
   1135         for temp_arg in temp_args:

/usr/local/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
     61     def deco(*a, **kw):
     62         try:
---> 63             return f(*a, **kw)
     64         except py4j.protocol.Py4JJavaError as e:
     65             s = e.java_exception.toString()

/usr/local/spark/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    317                 raise Py4JJavaError(
    318                     "An error occurred while calling {0}{1}{2}.\n".
--> 319                     format(target_id, ".", name), value)
    320             else:
    321                 raise Py4JError(

Py4JJavaError: An error occurred while calling o138.save.
: org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory file:/home/swt/pythonwork/PythonProject/ALSmodel/metadata already exists
	at org.apache.hadoop.mapred.FileOutputFormat.checkOutputSpecs(FileOutputFormat.java:131)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1191)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1168)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1168)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1168)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:1071)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1037)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:1037)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1037)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:963)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:963)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:963)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:962)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1489)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1468)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1468)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1468)
	at org.apache.spark.mllib.recommendation.MatrixFactorizationModel$SaveLoadV1_0$.save(MatrixFactorizationModel.scala:361)
	at org.apache.spark.mllib.recommendation.MatrixFactorizationModel.save(MatrixFactorizationModel.scala:206)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)

你可能感兴趣的:(虚拟机+大数据,pyspark,ALS)