Collaborative filtering with pyspark.mllib ALS (a worked example)

 

The underlying theory is covered in the previous post: https://blog.csdn.net/a8131357leo/article/details/100625257

Training the model

from pyspark.mllib.recommendation import ALS
from pyspark.conf import SparkConf
from pyspark.context import SparkContext


# configure logging
def SetLogger(sc):
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger('akka').setLevel(logger.Level.ERROR)
    logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)

# set the data path
def SetPath(sc):
    
    global Path

    if sc.master[0:5]=='local':
        Path = 'file:/home/hadoop/workspace/pythonProject/'
    else:
        Path = 'hdfs://master:9000/user/ang/'

# create the SparkContext
def CreateSparkContext(name):
    sparkconf = SparkConf()\
                    .setAppName(name)\
                    .set("spark.ui.showConsoleProgress","false")
    sc = SparkContext(conf = sparkconf)
    print("master="+sc.master)
    SetLogger(sc)
    SetPath(sc)
    return sc



def SaveModel(sc, model):
    try:
        model.save(sc, Path+"ALSmodel1")
        print('----saving ALSmodel completed---')
    except Exception:
        print("----ALS model already exists, delete it first----")

"""
SPARK 2.4之前,ALS训练需要 RatingsRDD 数据格式,在2.4下,只要保持(user,product,rating) 格式就可以,不需要单独transform 呈RatingsRDD

"""

def PrepareData(sc):
    # read the ml-20m ratings file and drop the header row
    rawUserData = sc.textFile(Path+"data/ml-20m/ratings.csv")
    header = rawUserData.first()
    rawUserData = rawUserData.filter(lambda x: x != header)
    # keep only (userId, movieId, rating) and cast the fields to numeric types
    rawRatings = rawUserData.map(lambda line: line.split(",")[:3])\
                            .map(lambda f: (int(f[0]), int(f[1]), float(f[2])))

    return rawRatings
    
    

if __name__=="__main__":
    sc = CreateSparkContext('movie_recommendation')
    print("===================Preparing Data==================")
    
    ratingsRDD = PrepareData(sc)
    print("===================Training Data===================")
    print()
    print("ALS training, parameter: rank=5,iteration=5,lambda=0.1")
    print()
    model = ALS.train(ratingsRDD, 5, 5, 0.1)
    print("==================Save Model========================")
    SaveModel(sc, model)
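
On a Spark version where ALS.train still expects Rating objects (or simply to make the types explicit), the tuples can be wrapped before training. A minimal sketch, assuming the rawRatings RDD returned by PrepareData above:

from pyspark.mllib.recommendation import ALS, Rating

# wrap each (user, movie, rating) tuple as a Rating object, then train as before
ratingsRDD = rawRatings.map(lambda f: Rating(int(f[0]), int(f[1]), float(f[2])))
model = ALS.train(ratingsRDD, 5, 5, 0.1)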
    
    

 

Using the saved model to make recommendations

# use the trained ALS model to make recommendations

import sys
import csv
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.mllib.recommendation import MatrixFactorizationModel


def SetLogger(sc):
    """
    Silence the verbose Spark/akka logging so the useful output
    does not get buried in INFO messages.
    """
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger('akka').setLevel(logger.Level.ERROR)
    logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)

# set the path to the data files
def SetPath(sc):
    
    global Path
    
    # if running in local mode, use the local file path
    if sc.master[0:5]=='local':
        Path = 'file:/home/hadoop/workspace/pythonProject/'
    else:
        # otherwise use the HDFS path on the cluster
        Path = 'hdfs://master:9000/user/ang/'

def CreateSparkContext(name):
    sparkconf = SparkConf()\
                    .setAppName(name)\
                    .set("spark.ui.showConsoleProgress","false")
                    
    sc = SparkContext(conf = sparkconf)
    print("master="+sc.master)
    
    # silence the log output
    SetLogger(sc)

    # set the data path
    SetPath(sc)
    return sc

def PrepareData(sc):
    print('--------start reading MOVIE data and building the lookup dictionary------')
    itemRDD = sc.textFile(Path+"data/ml-20m/movies.csv")

    # drop the header row
    head = itemRDD.first()
    itemRDD = itemRDD.filter(lambda x: x != head)

    # build a {movieId: title} dictionary for fast lookups; whether collecting
    # to a local dict makes sense depends on its size and on whether it is reused.
    # csv.reader is used so that titles containing commas are not truncated.
    movieTitle = itemRDD.map(lambda line: next(csv.reader([line])))\
                        .map(lambda fields: (int(fields[0]), fields[1]))\
                        .collectAsMap()

    return movieTitle

# load the ALS model that was saved earlier
def loadModel(sc):
    try:
        model = MatrixFactorizationModel.load(sc, Path+"ALSmodel1")
        print("-----ALS model loaded-----")
    except Exception:
        print("*****cannot find the model, train and save it first*****")
        exit(-1)
    return model
    
def Recommend(model, movieTitle):
    if sys.argv[1] == "--U":
        RecommendMovies(model, movieTitle, int(sys.argv[2]))
    elif sys.argv[1] == "--M":
        RecommendUsers(model, movieTitle, int(sys.argv[2]))
        
# given a USER ID, recommend 10 movies for that user
def RecommendMovies(model,movieTitle,inputUserID):
    Rec_movie = model.recommendProducts(inputUserID, 10)
    print('Recommended movies for USER ' + str(inputUserID) + ':')
    print()
    for r in Rec_movie:
        print("USER_ID:{0}, recommend:{1}, rec_rating:{2}".format(r[0], movieTitle[r[1]], r[2]))

# given a MOVIE ID, list the 10 users it should be recommended to
def RecommendUsers(model,movieTitle,inputMovieID):
    Rec_user = model.recommendUsers(inputMovieID, 10)
    print('MOVIE: ' + str(movieTitle[inputMovieID]) + ' recommended to the following USERS:')
    for r in Rec_user:
        print("USER_ID:{0}, rec_rating:{1}".format(r[0], r[2]))
        


if __name__=='__main__':
    
    # the command line should contain three tokens: the script (*.py), a flag (--U for user / --M for movie), and an ID
    if len(sys.argv)!=3:
        print('****please provide 2 arguments: --U|--M and an ID******')
        exit(-1)
    sc = CreateSparkContext('recommendation')
    print("===================== preparing Data===================")
    movieTitle = PrepareData(sc)
    print('[   data prepared  ]')
    print("===================== load model ======================")
    model = loadModel(sc)
    print("[   model loaded   ] ")
    print('===================== make recommendation =============')
    Recommend(model, movieTitle)
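
The script is run with two extra arguments, e.g. spark-submit recommend.py --U 100 to get 10 movies for user 100, or --M 1 to get 10 users for movie 1 (the file name and IDs here are only examples). Besides the two top-N helpers, the loaded model can also score a single user/movie pair; a minimal sketch, assuming the sc, Path and movieTitle set up above:

from pyspark.mllib.recommendation import MatrixFactorizationModel

model = MatrixFactorizationModel.load(sc, Path + "ALSmodel1")

user_id, movie_id = 100, 1      # hypothetical IDs, purely for illustration
score = model.predict(user_id, movie_id)
print("predicted rating of user {0} for '{1}': {2:.3f}".format(user_id, movieTitle[movie_id], score))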

 

The mllib library has no built-in hyperparameter tuning, so picking lambda, the rank, the number of iterations and so on is a real pain. Unless you have no other choice, don't use pyspark.mllib.
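
A manual grid search over a held-out split is about the best you can do. A rough sketch, assuming the (user, movie, rating) ratingsRDD from the training script above; the parameter grids and the 80/20 split are just examples:

from pyspark.mllib.recommendation import ALS

trainData, validData = ratingsRDD.randomSplit([0.8, 0.2], seed=42)

def rmse(model, data):
    # predict every (user, movie) pair in `data` and compare with the true rating
    pairs = data.map(lambda r: (int(r[0]), int(r[1])))
    preds = model.predictAll(pairs).map(lambda p: ((p.user, p.product), p.rating))
    truth = data.map(lambda r: ((int(r[0]), int(r[1])), float(r[2])))
    squared = truth.join(preds).values().map(lambda tp: (tp[0] - tp[1]) ** 2)
    return squared.mean() ** 0.5

best = None
for rank in [5, 10, 20]:
    for lam in [0.01, 0.1, 1.0]:
        m = ALS.train(trainData, rank, iterations=10, lambda_=lam)
        err = rmse(m, validData)
        print("rank={0} lambda={1} RMSE={2:.4f}".format(rank, lam, err))
        if best is None or err < best[0]:
            best = (err, rank, lam)

print("best (RMSE, rank, lambda): {0}".format(best))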
