




言归正传,下面来说说针对百度这个比赛如何用SVD来实现推荐系统。为了了解基本原理,可以先看看这篇文章:推荐系统相关算法(1):SVD(后面提到的三篇论文也值得一读)。



●  任务一

■ training_set.txt为用户评分数据,共三列,从左到右依次为userId、movieId、rating,即用户id、电影id、该用户对该电影的评分。列之间以'\t'分隔,行之间以'\r\n'分隔。

■ predict.txt为预测集合,共两列,从左到右依次是userId、movieId,即用户id、电影id。列之间以'\t'分隔,行之间以'\r\n'分隔。参赛者需要预测出该用户对该电影的评分,作为第三列,并提交给评测平台。需要注意的是,参赛者最终提交的predict.txt是三列,列之间以'\t'分隔,行之间以'\r\n'分隔。行之间的顺序不能乱,行的总数不能少。


7245481	962729	4.0

7245481	794171

# Remap the raw user and movie ids found in the training data to dense,
# consecutive integer ids (0, 1, 2, ... in order of first appearance) and
# rewrite both the training set and the prediction set using those ids.
#
# Files read:
#   training_set.txt           raw_user raw_movie rating (whitespace separated)
#   test.txt                   raw_user raw_movie
# Files written:
#   usermap.txt                raw_user dense_user_id
#   moviemap.txt               raw_movie dense_movie_id
#   smallMatrix.txt            dense_user_id dense_movie_id rating
#   smallPredictionMatrix.txt  dense_user_id dense_movie_id
#
# NOTE(review): the task description and the final merge step call the
# prediction input 'predict.txt', while this script reads 'test.txt' --
# confirm which file name is intended.
userMap = {}
movieMap = {}

# A single `with` guarantees every handle is flushed and closed, even if a
# malformed line raises part-way through.
with open('training_set.txt') as fp, \
        open('usermap.txt', 'w') as fp_user, \
        open('moviemap.txt', 'w') as fp_movis, \
        open('smallMatrix.txt', 'w') as fp_out, \
        open('test.txt', 'r') as fp_prediction, \
        open('smallPredictionMatrix.txt', 'w') as fp_out2:

    for line in fp:
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. a trailing newline at end of file).
            continue
        tup = line.split()
        raw_user = tup[0]
        raw_movie = tup[1]
        rate = float(tup[2])
        # setdefault assigns the next free dense id the first time a raw id
        # is seen; len() is evaluated before the key is inserted.
        user_id = userMap.setdefault(raw_user, len(userMap))
        movie_id = movieMap.setdefault(raw_movie, len(movieMap))
        fp_out.write('{0} {1} {2}\n'.format(user_id, movie_id, rate))

    for raw_user, user_id in userMap.items():
        fp_user.write('{0} {1}\n'.format(raw_user, user_id))

    for raw_movie, movie_id in movieMap.items():
        fp_movis.write('{0} {1}\n'.format(raw_movie, movie_id))

    for line2 in fp_prediction:
        line2 = line2.strip()
        if not line2:
            continue
        tup2 = line2.split()
        raw_user2 = tup2[0]
        raw_movie2 = tup2[1]
        # A user or movie that never occurred in the training set has no
        # dense id, so these lookups raise KeyError for such rows.
        user_id2 = userMap[raw_user2]
        movie_id2 = movieMap[raw_movie2]
        fp_out2.write('{0} {1}\n'.format(user_id2, movie_id2))



0 0 4.0
0 617
3.579231 10000 10000 10 0.01 0.05
averageScore userNum itemNum factorNum learnRate regularization 




import math
import random
import cPickle as pickle

#calculate the overall average
def Average(fileName):
    """Return the mean of the rating column of a ratings file.

    Each non-blank line of *fileName* is split on whitespace and its third
    field is taken as the rating. Ratings are parsed as floats, so both
    "4" and "4.0" are accepted.

    Raises:
        ValueError: if the file contains no rating lines, or a rating is
            not numeric.
    """
    total = 0.0
    cnt = 0
    with open(fileName, 'r') as fi:
        for line in fi:
            arr = line.split()
            if not arr:
                # Ignore blank lines instead of failing on arr[2].
                continue
            cnt += 1
            total += float(arr[2])
    if cnt == 0:
        raise ValueError("no ratings found in %s" % fileName)
    return total / cnt

def InerProduct(v1, v2):
    """Return the inner (dot) product of two sequences.

    The loop runs over the positions of *v1*; *v2* is indexed at the same
    positions, so extra trailing elements of *v2* are ignored and a
    shorter *v2* raises IndexError.
    """
    total = 0
    for idx, left in enumerate(v1):
        total = total + left * v2[idx]
    return total

def PredictScore(av, bu, bi, pu, qi):
    """Predict a rating and clamp it to the range [1, 5].

    The raw estimate is av + bu + bi + <pu, qi>, where av is the global
    average, bu and bi are the user and item biases, and pu and qi are the
    user and item factor vectors.
    """
    estimate = av + bu + bi + InerProduct(pu, qi)
    if estimate > 5:
        return 5
    if estimate < 1:
        return 1
    return estimate

def SVD(configureFile, trainDataFile, modelSaveFile, stepNum=5):
    """Train a biased matrix-factorisation model and pickle it to disk.

    Args:
        configureFile: text file whose first line holds six whitespace
            separated fields: averageScore userNum itemNum factorNum
            learnRate regularization.
        trainDataFile: ratings file, one "userId itemId rating" per line.
        modelSaveFile: output path; bu, bi, qi and pu are pickled into it
            in that order.
        stepNum: number of passes over the training file (default 5).

    Each training sample updates the user bias, the item bias and both
    factor vectors from the prediction error, using learnRate as the step
    size and regularization as the shrinkage weight.

    To add early stopping, evaluate Validate() on held-out data after each
    pass and stop once the RMSE no longer decreases.

    NOTE(review): ids read from trainDataFile have 1 subtracted before
    being used as list indices, i.e. they are treated as 1-based. An id of
    0 therefore becomes index -1, which Python resolves to the last slot
    instead of raising -- confirm the data file really is 1-based.
    """
    # Read the hyper-parameters from the first line of the config file.
    with open(configureFile, 'r') as conf:
        arr = conf.readline().split()
    averageScore = float(arr[0])
    userNum = int(arr[1])
    itemNum = int(arr[2])
    factorNum = int(arr[3])
    learnRate = float(arr[4])
    regularization = float(arr[5])

    # Biases start at zero; factors start as small random values scaled
    # by 1 / sqrt(factorNum).
    bi = [0.0] * itemNum
    bu = [0.0] * userNum
    scale = math.sqrt(factorNum)
    qi = [[(0.1 * random.random() / scale) for j in range(factorNum)] for i in range(itemNum)]
    pu = [[(0.1 * random.random() / scale) for j in range(factorNum)] for i in range(userNum)]
    print("initialization end\nstart training\n")

    # Train the model: one pass over the file per step.
    for step in range(stepNum):
        with open(trainDataFile, 'r') as fi:
            for line in fi:
                arr = line.split()
                if not arr:
                    continue
                uid = int(arr[0]) - 1
                iid = int(arr[1]) - 1
                # Ratings may be written as "4.0", which int() rejects.
                score = float(arr[2])
                prediction = PredictScore(averageScore, bu[uid], bi[iid], pu[uid], qi[iid])
                eui = score - prediction
                # Update parameters.
                bu[uid] += learnRate * (eui - regularization * bu[uid])
                bi[iid] += learnRate * (eui - regularization * bi[iid])
                userVec = pu[uid]
                itemVec = qi[iid]
                for k in range(factorNum):
                    # Save the user factor before updating it: the item
                    # update below must use the old value.
                    oldUserFactor = userVec[k]
                    userVec[k] += learnRate * (eui * itemVec[k] - regularization * oldUserFactor)
                    itemVec[k] += learnRate * (eui * oldUserFactor - regularization * itemVec[k])

    # Write the model; `with` ensures the file is flushed and closed
    # before Predict() reads it back.
    with open(modelSaveFile, 'wb') as fo:
        pickle.dump(bu, fo, 1)
        pickle.dump(bi, fo, 1)
        pickle.dump(qi, fo, 1)
        pickle.dump(pu, fo, 1)
    print("model generation over")
# Validate the model
def Validate(testDataFile, av, bu, bi, pu, qi):
    """Return the RMSE of the model's predictions on a labelled ratings file.

    Args:
        testDataFile: ratings file, one "userId itemId rating" per line.
        av: global average rating.
        bu, bi: per-user and per-item bias lists.
        pu, qi: per-user and per-item factor vectors.

    Ids have 1 subtracted before being used as list indices. Blank lines
    are skipped.

    Raises:
        ValueError: if the file contains no rating lines.
    """
    cnt = 0
    squaredError = 0.0
    with open(testDataFile, 'r') as fi:
        for line in fi:
            arr = line.split()
            if not arr:
                continue
            cnt += 1
            uid = int(arr[0]) - 1
            iid = int(arr[1]) - 1
            pScore = PredictScore(av, bu[uid], bi[iid], pu[uid], qi[iid])
            # Ratings may be written as "4.0", which int() rejects.
            tScore = float(arr[2])
            diff = tScore - pScore
            squaredError += diff * diff
    if cnt == 0:
        raise ValueError("no ratings found in %s" % testDataFile)
    return math.sqrt(squaredError / cnt)


# Use a saved model to make predictions
def Predict(configureFile, modelSaveFile, testDataFile, resultSaveFile):
    """Predict a rating for every (user, item) pair in a file.

    Args:
        configureFile: config file; only its first field (the global
            average score) is used here.
        modelSaveFile: pickle file holding bu, bi, qi and pu in that order.
        testDataFile: one "userId itemId" pair per line; ids have 1
            subtracted before being used as list indices.
        resultSaveFile: output path; one predicted score per line,
            formatted with "%f".

    Blank input lines are skipped and produce no output line.
    """
    # Get the global average from the configuration.
    with open(configureFile, 'r') as conf:
        averageScore = float(conf.readline().split()[0])

    # Load the model.
    # NOTE: unpickling can execute arbitrary code -- only load model files
    # that come from a trusted source.
    with open(modelSaveFile, 'rb') as modelFile:
        bu = pickle.load(modelFile)
        bi = pickle.load(modelFile)
        qi = pickle.load(modelFile)
        pu = pickle.load(modelFile)

    # `with` guarantees the results are flushed to disk before any later
    # step reads resultSaveFile.
    with open(testDataFile, 'r') as fi, open(resultSaveFile, 'w') as fo:
        for line in fi:
            arr = line.split()
            if not arr:
                continue
            uid = int(arr[0]) - 1
            iid = int(arr[1]) - 1
            pScore = PredictScore(averageScore, bu[uid], bi[iid], pu[uid], qi[iid])
            fo.write("%f\n" % pScore)
    print("predict over")

if __name__ == '__main__':
    # Train on the remapped rating matrix, then score the remapped
    # prediction pairs with the saved model.
    import os

    configureFile = 'svd.conf'
    # os.path.join picks the right separator for the current platform; the
    # previous hard-coded 'ml_data\\...' paths only resolved on Windows.
    trainDataFile = os.path.join('ml_data', 'smallMatrix.txt')
    testDataFile = os.path.join('ml_data', 'smallPredictionMatrix.txt')
    modelSaveFile = 'svd_model.pkl'
    resultSaveFile = 'prediction.txt'
    # The averageScore field of the config file can be obtained by calling
    # Average() on a ratings file, e.g. print("%f" % Average("ua.base")).
    SVD(configureFile, trainDataFile, modelSaveFile)
    Predict(configureFile, modelSaveFile, testDataFile, resultSaveFile)



# Append each predicted score from prediction.txt as an extra tab-separated
# column to the matching line of predict.txt, writing the result to
# file3.txt.
# zip() stops at the shorter input, so if the two files differ in length the
# surplus lines of the longer one are not written.
with open('predict.txt') as fp1, \
        open('prediction.txt') as fp2, \
        open('file3.txt', 'w') as fp_out:
    for line1, line2 in zip(fp1, fp2):
        line1 = line1.strip()
        line2 = line2.strip()
        fp_out.write('{0}\t{1}\n'.format(line1, line2))


7245481	794171	3.879440
