# Pearson similarity (皮尔逊相似度)
import math
def read2_data(train_path='u1.base', test_path='u1.test'):  # for a dataset that is already split
    """Load the pre-split training and test rating files.

    Each non-blank line must hold four tab-separated fields: user, movie,
    rating and timestamp.  Values are kept as the strings read from the
    file; the timestamp is discarded.

    Args:
        train_path: Path of the training file (defaults to ``'u1.base'``).
        test_path: Path of the test file (defaults to ``'u1.test'``).

    Returns:
        A ``(trainSet, testSet)`` tuple of nested dicts shaped
        ``{user: {movie: rating}}``.

    Raises:
        OSError: If either file cannot be opened.
        ValueError: If a non-blank line does not split into four fields.
    """
    trainSet = {}
    testSet = {}
    # The training file is read first, then the test file.
    for path, target in ((train_path, trainSet), (test_path, testSet)):
        # ``with`` guarantees the handle is closed even if parsing fails.
        with open(path) as handle:
            for line in handle:
                if not line.strip():
                    continue  # tolerate blank lines (e.g. a trailing newline)
                user, movie, rating, _timestamp = line.split('\t')
                # Two-level dict: the outer key is the user, the inner key the movie.
                target.setdefault(user, {})[movie] = rating
    return trainSet, testSet
def read_data(filename, pivot):  # for a complete dataset that has not been split yet
    """Read a comma-separated rating file and split it randomly.

    Each non-blank line must hold four comma-separated fields: user, movie,
    rating and timestamp.  A line goes to the training set when
    ``random.random() < pivot`` and to the test set otherwise.  Ratings are
    kept as strings.  The sizes of both sets are printed when done.

    Args:
        filename: Path of the rating file.
        pivot: Threshold compared against ``random.random()`` for each line.

    Returns:
        A ``(trainSet, testSet)`` tuple of nested dicts shaped
        ``{user: {movie: rating}}``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank line does not split into four fields.
    """
    # Imported locally: the module itself only imports ``math``.
    import random

    trainSet = {}
    testSet = {}
    trainSet_len = 0
    testSet_len = 0
    # Read the file directly; there is no ``self`` in a module-level function.
    with open(filename) as handle:
        for raw_line in handle:
            line = raw_line.strip('\r\n')
            if not line:
                continue
            user, movie, rating, _timestamp = line.split(',')  # change the separator for other formats
            if random.random() < pivot:
                trainSet.setdefault(user, {})[movie] = rating
                trainSet_len += 1
            else:
                testSet.setdefault(user, {})[movie] = rating
                testSet_len += 1
    print('已划分')
    print('训练集长度 = %s' % trainSet_len)
    print('测试集长度 = %s' % testSet_len)
    return trainSet, testSet
# Compute the Pearson similarity between two users.
def simi_pearson(a, b, trainSet=None):
    """Return the Pearson-style similarity of users ``a`` and ``b``.

    The numerator sums, over the movies both users rated, the product of
    each user's deviation from their own mean rating.  The denominator is
    the product of the root sums of squared deviations, each taken over
    ALL of that user's ratings (not only the shared movies).

    Args:
        a: Key of the first user in the training set.
        b: Key of the second user in the training set.
        trainSet: Optional ``{user: {movie: rating}}`` dict.  When omitted
            the training set is loaded with ``read2_data()``, which reads
            the data files again on every call.

    Returns:
        The similarity as a float, or ``0`` when the numerator is zero
        (for example when the users share no movie) or either user has no
        ratings.

    Raises:
        KeyError: If ``a`` or ``b`` is not in the training set.
    """
    if trainSet is None:
        trainSet = read2_data()[0]
    # Ratings are stored as strings; convert each one once.
    a_data = {movie: float(rating) for movie, rating in trainSet[a].items()}
    b_data = {movie: float(rating) for movie, rating in trainSet[b].items()}
    if not a_data or not b_data:
        return 0  # no ratings means no mean, hence no similarity
    a_average_rating = sum(a_data.values()) / len(a_data)
    b_average_rating = sum(b_data.values()) / len(b_data)
    # x stays 0 when the two users have no movie in common.
    x = sum(
        (a_data[movie] - a_average_rating) * (b_data[movie] - b_average_rating)
        for movie in a_data
        if movie in b_data
    )
    if not x:
        return 0
    y = math.sqrt(sum((r - a_average_rating) ** 2 for r in a_data.values())) * \
        math.sqrt(sum((r - b_average_rating) ** 2 for r in b_data.values()))
    if not y:
        return 0
    return x / y
# Compute the similarity of a and b with the cosine measure.
def simi_cos(a, b, trainSet=None):
    """Return the cosine similarity of users ``a`` and ``b``.

    The numerator is the dot product of the two users' ratings over the
    movies both rated.  The denominator is the product of the Euclidean
    norms of each user's full rating vector.

    Args:
        a: Key of the first user in the training set.
        b: Key of the second user in the training set.
        trainSet: Optional ``{user: {movie: rating}}`` dict.  When omitted
            the training set is loaded with ``read2_data()``, which reads
            the data files again on every call.

    Returns:
        The similarity as a float, or ``0`` when the dot product is zero
        (for example when the users share no movie).

    Raises:
        KeyError: If ``a`` or ``b`` is not in the training set.
    """
    if trainSet is None:
        trainSet = read2_data()[0]
    # Ratings are stored as strings; convert each one once.
    a_data = {movie: float(rating) for movie, rating in trainSet[a].items()}
    b_data = {movie: float(rating) for movie, rating in trainSet[b].items()}
    # x stays 0 when the two users have no movie in common.
    x = sum(a_data[movie] * b_data[movie] for movie in a_data if movie in b_data)
    if not x:
        return 0
    y = math.sqrt(sum(r ** 2 for r in a_data.values())) * \
        math.sqrt(sum(r ** 2 for r in b_data.values()))
    if not y:
        return 0
    return x / y
# Estimate held-out ratings and report the per-user error.
def error(k, trainSet=None, testSet=None, similarity=None):
    """Predict each user's test ratings from the k nearest neighbours.

    For every user in the training set the ``k`` most similar other users
    are selected.  Each test rating of a movie the user has not rated in
    the training set is predicted as the user's mean rating plus the
    similarity-weighted average of the neighbours' deviations from their
    own means.  When no selected neighbour rated the movie, the prediction
    falls back to the user's mean.

    The selected neighbours and the two error values are printed per user.

    Args:
        k: Number of most-similar neighbours to use.
        trainSet: Optional ``{user: {movie: rating}}`` training data; loaded
            with ``read2_data()`` when omitted.
        testSet: Optional test data of the same shape; loaded with
            ``read2_data()`` when omitted.
        similarity: Optional callable ``similarity(a, b)`` returning a
            number; defaults to ``simi_pearson``.

    Returns:
        ``(list_MSE, list_RMSE)`` with one entry per training user, in
        training-set order.  ``list_MSE`` holds the mean absolute error and
        ``list_RMSE`` the root mean squared error over that user's
        evaluated test ratings; both are ``0.0`` for a user with none.
    """
    if trainSet is None or testSet is None:
        loaded_train, loaded_test = read2_data()
        if trainSet is None:
            trainSet = loaded_train
        if testSet is None:
            testSet = loaded_test
    if similarity is None:
        similarity = simi_pearson

    # Ratings are stored as strings; convert once instead of inside the loops.
    train = {
        user: {movie: float(rating) for movie, rating in ratings.items()}
        for user, ratings in trainSet.items()
    }
    means = {
        user: sum(ratings.values()) / len(ratings)
        for user, ratings in train.items()
        if ratings
    }

    list_MSE = []
    list_RMSE = []
    for user, user_data in train.items():
        simi_dict = {
            other_user: similarity(user, other_user)
            for other_user in train
            if other_user != user
        }
        # Most similar first; the result is a list of (user, similarity) pairs.
        sorted_simi_dict2list = sorted(
            simi_dict.items(), key=lambda x: x[1], reverse=True
        )[:k]
        print(sorted_simi_dict2list)

        user_mean = means.get(user, 0.0)
        abs_error_sum = 0.0
        squared_error_sum = 0.0
        cnt = 0  # number of test ratings actually evaluated for this user
        # Only movies with a held-out rating can contribute to the error.
        for movie, actual in testSet.get(user, {}).items():
            if movie in user_data:
                continue  # only movies missing from the training data are predicted
            weighted_dev = 0.0
            weight_total = 0.0
            for neighbour, sim in sorted_simi_dict2list:
                neighbour_data = train[neighbour]
                if movie in neighbour_data:
                    weighted_dev += sim * (neighbour_data[movie] - means[neighbour])
                    weight_total += abs(sim)
            pre_rating = user_mean
            if weight_total:
                pre_rating += weighted_dev / weight_total
            diff = pre_rating - float(actual)
            abs_error_sum += abs(diff)
            # Square each individual error, not the running absolute sum.
            squared_error_sum += diff ** 2
            cnt += 1

        if cnt:
            MSE = abs_error_sum / cnt
            RMSE = math.sqrt(squared_error_sum / cnt)
        else:
            MSE = 0.0
            RMSE = 0.0
        list_MSE.append(MSE)
        list_RMSE.append(RMSE)
        print(MSE, RMSE)
    return list_MSE, list_RMSE
def recommend(user):
    """Placeholder for per-user recommendation; does nothing and returns None."""
    return None
if __name__ == '__main__':
    # Script entry point: evaluate with the 10 most similar neighbours per user.
    pear_MSE, pear_RMSE = error(k=10)
    print('OK')
# Jaccard similarity (杰卡德相似度)
import math
def read2_data(train_path='u1.base', test_path='u1.test'):  # for a dataset that is already split
    """Load the pre-split training and test rating files.

    Each non-blank line must hold four tab-separated fields: user, movie,
    rating and timestamp.  Values are kept as the strings read from the
    file; the timestamp is discarded.

    Args:
        train_path: Path of the training file (defaults to ``'u1.base'``).
        test_path: Path of the test file (defaults to ``'u1.test'``).

    Returns:
        A ``(trainSet, testSet)`` tuple of nested dicts shaped
        ``{user: {movie: rating}}``.

    Raises:
        OSError: If either file cannot be opened.
        ValueError: If a non-blank line does not split into four fields.
    """
    trainSet = {}
    testSet = {}
    # The training file is read first, then the test file.
    for path, target in ((train_path, trainSet), (test_path, testSet)):
        # ``with`` guarantees the handle is closed even if parsing fails.
        with open(path) as handle:
            for line in handle:
                if not line.strip():
                    continue  # tolerate blank lines (e.g. a trailing newline)
                user, movie, rating, _timestamp = line.split('\t')
                # Two-level dict: the outer key is the user, the inner key the movie.
                target.setdefault(user, {})[movie] = rating
    return trainSet, testSet
def read_data(filename, pivot):  # for a complete dataset that has not been split yet
    """Read a comma-separated rating file and split it randomly.

    Each non-blank line must hold four comma-separated fields: user, movie,
    rating and timestamp.  A line goes to the training set when
    ``random.random() < pivot`` and to the test set otherwise.  Ratings are
    kept as strings.  The sizes of both sets are printed when done.

    Args:
        filename: Path of the rating file.
        pivot: Threshold compared against ``random.random()`` for each line.

    Returns:
        A ``(trainSet, testSet)`` tuple of nested dicts shaped
        ``{user: {movie: rating}}``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank line does not split into four fields.
    """
    # Imported locally: the module itself only imports ``math``.
    import random

    trainSet = {}
    testSet = {}
    trainSet_len = 0
    testSet_len = 0
    # Read the file directly; there is no ``self`` in a module-level function.
    with open(filename) as handle:
        for raw_line in handle:
            line = raw_line.strip('\r\n')
            if not line:
                continue
            user, movie, rating, _timestamp = line.split(',')  # change the separator for other formats
            if random.random() < pivot:
                trainSet.setdefault(user, {})[movie] = rating
                trainSet_len += 1
            else:
                testSet.setdefault(user, {})[movie] = rating
                testSet_len += 1
    print('已划分')
    print('训练集长度 = %s' % trainSet_len)
    print('测试集长度 = %s' % testSet_len)
    return trainSet, testSet
def jacca(a, b, trainSet=None):
    """Return the Jaccard similarity of the movie sets of users ``a`` and ``b``.

    The similarity is the number of movies both users rated divided by the
    number of movies at least one of them rated.  Rating values are ignored.

    Args:
        a: Key of the first user in the training set.
        b: Key of the second user in the training set.
        trainSet: Optional ``{user: {movie: rating}}`` dict.  When omitted
            the training set is loaded with ``read2_data()``, which reads
            the data files again on every call.

    Returns:
        A float in ``[0, 1]``, or ``0`` when neither user rated anything.

    Raises:
        KeyError: If ``a`` or ``b`` is not in the training set.
    """
    if trainSet is None:
        trainSet = read2_data()[0]
    a_data = trainSet[a]
    b_data = trainSet[b]
    # Dict key views support set intersection directly.
    ab = len(a_data.keys() & b_data.keys())
    union_size = len(a_data) + len(b_data) - ab
    if not union_size:
        return 0  # both users have no ratings; avoid dividing by zero
    return ab / union_size
# Estimate held-out ratings and report the per-user error.
def error(k, trainSet=None, testSet=None, similarity=None):
    """Predict each user's test ratings from the k nearest neighbours.

    For every user in the training set the ``k`` most similar other users
    are selected.  Each test rating of a movie the user has not rated in
    the training set is predicted as the user's mean rating plus the
    similarity-weighted average of the neighbours' deviations from their
    own means.  When no selected neighbour rated the movie, the prediction
    falls back to the user's mean.

    The number of training users is printed first, then the selected
    neighbours and the two error values per user.

    Args:
        k: Number of most-similar neighbours to use.
        trainSet: Optional ``{user: {movie: rating}}`` training data; loaded
            with ``read2_data()`` when omitted.
        testSet: Optional test data of the same shape; loaded with
            ``read2_data()`` when omitted.
        similarity: Optional callable ``similarity(a, b)`` returning a
            number; defaults to ``jacca``.

    Returns:
        ``(list_MSE, list_RMSE)`` with one entry per training user, in
        training-set order.  ``list_MSE`` holds the mean absolute error and
        ``list_RMSE`` the root mean squared error over that user's
        evaluated test ratings; both are ``0.0`` for a user with none.
    """
    if trainSet is None or testSet is None:
        loaded_train, loaded_test = read2_data()
        if trainSet is None:
            trainSet = loaded_train
        if testSet is None:
            testSet = loaded_test
    if similarity is None:
        similarity = jacca

    print(len(trainSet))  # total number of users

    # Ratings are stored as strings; convert once instead of inside the loops.
    train = {
        user: {movie: float(rating) for movie, rating in ratings.items()}
        for user, ratings in trainSet.items()
    }
    means = {
        user: sum(ratings.values()) / len(ratings)
        for user, ratings in train.items()
        if ratings
    }

    list_MSE = []
    list_RMSE = []
    for user, user_data in train.items():
        simi_dict = {
            other_user: similarity(user, other_user)
            for other_user in train
            if other_user != user
        }
        # Most similar first; the result is a list of (user, similarity) pairs.
        sorted_simi_dict2list = sorted(
            simi_dict.items(), key=lambda x: x[1], reverse=True
        )[:k]
        print(sorted_simi_dict2list)

        user_mean = means.get(user, 0.0)
        abs_error_sum = 0.0
        squared_error_sum = 0.0
        cnt = 0  # number of test ratings actually evaluated for this user
        # Only movies with a held-out rating can contribute to the error.
        for movie, actual in testSet.get(user, {}).items():
            if movie in user_data:
                continue  # only movies missing from the training data are predicted
            weighted_dev = 0.0
            weight_total = 0.0
            for neighbour, sim in sorted_simi_dict2list:
                neighbour_data = train[neighbour]
                if movie in neighbour_data:
                    weighted_dev += sim * (neighbour_data[movie] - means[neighbour])
                    weight_total += abs(sim)
            pre_rating = user_mean
            if weight_total:
                pre_rating += weighted_dev / weight_total
            diff = pre_rating - float(actual)
            abs_error_sum += abs(diff)
            # Square each individual error, not the running absolute sum.
            squared_error_sum += diff ** 2
            cnt += 1

        if cnt:
            MSE = abs_error_sum / cnt
            RMSE = math.sqrt(squared_error_sum / cnt)
        else:
            MSE = 0.0
            RMSE = 0.0
        list_MSE.append(MSE)
        list_RMSE.append(RMSE)
        print(MSE, RMSE)
    return list_MSE, list_RMSE
def recommend(user):
    """Placeholder for per-user recommendation; does nothing and returns None."""
    return None
if __name__ == '__main__':
    # Script entry point: evaluate with the 10 most similar neighbours per user.
    jacca_MSE, jacca_RMSE = error(k=10)
    print('OK')
# Cosine similarity (余弦相似度)
import math
def read2_data(train_path='u1.base', test_path='u1.test'):  # for a dataset that is already split
    """Load the pre-split training and test rating files.

    Each non-blank line must hold four tab-separated fields: user, movie,
    rating and timestamp.  Values are kept as the strings read from the
    file; the timestamp is discarded.

    Args:
        train_path: Path of the training file (defaults to ``'u1.base'``).
        test_path: Path of the test file (defaults to ``'u1.test'``).

    Returns:
        A ``(trainSet, testSet)`` tuple of nested dicts shaped
        ``{user: {movie: rating}}``.

    Raises:
        OSError: If either file cannot be opened.
        ValueError: If a non-blank line does not split into four fields.
    """
    trainSet = {}
    testSet = {}
    # The training file is read first, then the test file.
    for path, target in ((train_path, trainSet), (test_path, testSet)):
        # ``with`` guarantees the handle is closed even if parsing fails.
        with open(path) as handle:
            for line in handle:
                if not line.strip():
                    continue  # tolerate blank lines (e.g. a trailing newline)
                user, movie, rating, _timestamp = line.split('\t')
                # Two-level dict: the outer key is the user, the inner key the movie.
                target.setdefault(user, {})[movie] = rating
    return trainSet, testSet
def read_data(filename, pivot):  # for a complete dataset that has not been split yet
    """Read a comma-separated rating file and split it randomly.

    Each non-blank line must hold four comma-separated fields: user, movie,
    rating and timestamp.  A line goes to the training set when
    ``random.random() < pivot`` and to the test set otherwise.  Ratings are
    kept as strings.  The sizes of both sets are printed when done.

    Args:
        filename: Path of the rating file.
        pivot: Threshold compared against ``random.random()`` for each line.

    Returns:
        A ``(trainSet, testSet)`` tuple of nested dicts shaped
        ``{user: {movie: rating}}``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank line does not split into four fields.
    """
    # Imported locally: the module itself only imports ``math``.
    import random

    trainSet = {}
    testSet = {}
    trainSet_len = 0
    testSet_len = 0
    # Read the file directly; there is no ``self`` in a module-level function.
    with open(filename) as handle:
        for raw_line in handle:
            line = raw_line.strip('\r\n')
            if not line:
                continue
            user, movie, rating, _timestamp = line.split(',')  # change the separator for other formats
            if random.random() < pivot:
                trainSet.setdefault(user, {})[movie] = rating
                trainSet_len += 1
            else:
                testSet.setdefault(user, {})[movie] = rating
                testSet_len += 1
    print('已划分')
    print('训练集长度 = %s' % trainSet_len)
    print('测试集长度 = %s' % testSet_len)
    return trainSet, testSet
# Compute the Pearson similarity between two users.
def simi_pearson(a, b, trainSet=None):
    """Return the Pearson-style similarity of users ``a`` and ``b``.

    The numerator sums, over the movies both users rated, the product of
    each user's deviation from their own mean rating.  The denominator is
    the product of the root sums of squared deviations, each taken over
    ALL of that user's ratings (not only the shared movies).

    Args:
        a: Key of the first user in the training set.
        b: Key of the second user in the training set.
        trainSet: Optional ``{user: {movie: rating}}`` dict.  When omitted
            the training set is loaded with ``read2_data()``, which reads
            the data files again on every call.

    Returns:
        The similarity as a float, or ``0`` when the numerator is zero
        (for example when the users share no movie) or either user has no
        ratings.

    Raises:
        KeyError: If ``a`` or ``b`` is not in the training set.
    """
    if trainSet is None:
        trainSet = read2_data()[0]
    # Ratings are stored as strings; convert each one once.
    a_data = {movie: float(rating) for movie, rating in trainSet[a].items()}
    b_data = {movie: float(rating) for movie, rating in trainSet[b].items()}
    if not a_data or not b_data:
        return 0  # no ratings means no mean, hence no similarity
    a_average_rating = sum(a_data.values()) / len(a_data)
    b_average_rating = sum(b_data.values()) / len(b_data)
    # x stays 0 when the two users have no movie in common.
    x = sum(
        (a_data[movie] - a_average_rating) * (b_data[movie] - b_average_rating)
        for movie in a_data
        if movie in b_data
    )
    if not x:
        return 0
    y = math.sqrt(sum((r - a_average_rating) ** 2 for r in a_data.values())) * \
        math.sqrt(sum((r - b_average_rating) ** 2 for r in b_data.values()))
    if not y:
        return 0
    return x / y
def jacca(a, b, trainSet=None):
    """Return the Jaccard similarity of the movie sets of users ``a`` and ``b``.

    The similarity is the number of movies both users rated divided by the
    number of movies at least one of them rated.  Rating values are ignored.

    Args:
        a: Key of the first user in the training set.
        b: Key of the second user in the training set.
        trainSet: Optional ``{user: {movie: rating}}`` dict.  When omitted
            the training set is loaded with ``read2_data()``; no global
            ``trainSet`` exists at module level, so the data must come from
            the argument or from that loader.

    Returns:
        A float in ``[0, 1]``, or ``0`` when neither user rated anything.

    Raises:
        KeyError: If ``a`` or ``b`` is not in the training set.
    """
    if trainSet is None:
        trainSet = read2_data()[0]
    a_data = trainSet[a]
    b_data = trainSet[b]
    # Dict key views support set intersection directly.
    ab = len(a_data.keys() & b_data.keys())
    union_size = len(a_data) + len(b_data) - ab
    if not union_size:
        return 0  # both users have no ratings; avoid dividing by zero
    return ab / union_size
# Compute the similarity of a and b with the cosine measure.
def simi_cos(a, b, trainSet=None):
    """Return the cosine similarity of users ``a`` and ``b``.

    The numerator is the dot product of the two users' ratings over the
    movies both rated.  The denominator is the product of the Euclidean
    norms of each user's full rating vector.

    Args:
        a: Key of the first user in the training set.
        b: Key of the second user in the training set.
        trainSet: Optional ``{user: {movie: rating}}`` dict.  When omitted
            the training set is loaded with ``read2_data()``, which reads
            the data files again on every call.

    Returns:
        The similarity as a float, or ``0`` when the dot product is zero
        (for example when the users share no movie).

    Raises:
        KeyError: If ``a`` or ``b`` is not in the training set.
    """
    if trainSet is None:
        trainSet = read2_data()[0]
    # Ratings are stored as strings; convert each one once.
    a_data = {movie: float(rating) for movie, rating in trainSet[a].items()}
    b_data = {movie: float(rating) for movie, rating in trainSet[b].items()}
    # x stays 0 when the two users have no movie in common.
    x = sum(a_data[movie] * b_data[movie] for movie in a_data if movie in b_data)
    if not x:
        return 0
    y = math.sqrt(sum(r ** 2 for r in a_data.values())) * \
        math.sqrt(sum(r ** 2 for r in b_data.values()))
    if not y:
        return 0
    return x / y
# Estimate held-out ratings and report the per-user error.
def error(k, trainSet=None, testSet=None, similarity=None):
    """Predict each user's test ratings from the k nearest neighbours.

    For every user in the training set the ``k`` most similar other users
    are selected.  Each test rating of a movie the user has not rated in
    the training set is predicted as the user's mean rating plus the
    similarity-weighted average of the neighbours' deviations from their
    own means.  When no selected neighbour rated the movie, the prediction
    falls back to the user's mean.

    The selected neighbours and the two error values are printed per user.

    Args:
        k: Number of most-similar neighbours to use.
        trainSet: Optional ``{user: {movie: rating}}`` training data; loaded
            with ``read2_data()`` when omitted.
        testSet: Optional test data of the same shape; loaded with
            ``read2_data()`` when omitted.
        similarity: Optional callable ``similarity(a, b)`` returning a
            number; defaults to ``simi_cos``.

    Returns:
        ``(list_MSE, list_RMSE)`` with one entry per training user, in
        training-set order.  ``list_MSE`` holds the mean absolute error and
        ``list_RMSE`` the root mean squared error over that user's
        evaluated test ratings; both are ``0.0`` for a user with none.
    """
    if trainSet is None or testSet is None:
        loaded_train, loaded_test = read2_data()
        if trainSet is None:
            trainSet = loaded_train
        if testSet is None:
            testSet = loaded_test
    if similarity is None:
        similarity = simi_cos

    # Ratings are stored as strings; convert once instead of inside the loops.
    train = {
        user: {movie: float(rating) for movie, rating in ratings.items()}
        for user, ratings in trainSet.items()
    }
    means = {
        user: sum(ratings.values()) / len(ratings)
        for user, ratings in train.items()
        if ratings
    }

    list_MSE = []
    list_RMSE = []
    for user, user_data in train.items():
        simi_dict = {
            other_user: similarity(user, other_user)
            for other_user in train
            if other_user != user
        }
        # Most similar first; the result is a list of (user, similarity) pairs.
        sorted_simi_dict2list = sorted(
            simi_dict.items(), key=lambda x: x[1], reverse=True
        )[:k]
        print(sorted_simi_dict2list)

        user_mean = means.get(user, 0.0)
        abs_error_sum = 0.0
        squared_error_sum = 0.0
        cnt = 0  # number of test ratings actually evaluated for this user
        # Only movies with a held-out rating can contribute to the error.
        for movie, actual in testSet.get(user, {}).items():
            if movie in user_data:
                continue  # only movies missing from the training data are predicted
            weighted_dev = 0.0
            weight_total = 0.0
            for neighbour, sim in sorted_simi_dict2list:
                neighbour_data = train[neighbour]
                if movie in neighbour_data:
                    weighted_dev += sim * (neighbour_data[movie] - means[neighbour])
                    weight_total += abs(sim)
            pre_rating = user_mean
            if weight_total:
                pre_rating += weighted_dev / weight_total
            diff = pre_rating - float(actual)
            abs_error_sum += abs(diff)
            # Square each individual error, not the running absolute sum.
            squared_error_sum += diff ** 2
            cnt += 1

        if cnt:
            MSE = abs_error_sum / cnt
            RMSE = math.sqrt(squared_error_sum / cnt)
        else:
            MSE = 0.0
            RMSE = 0.0
        list_MSE.append(MSE)
        list_RMSE.append(RMSE)
        print(MSE, RMSE)
    return list_MSE, list_RMSE
def recommend(user):
    """Placeholder for per-user recommendation; does nothing and returns None."""
    return None
if __name__ == '__main__':
    # Script entry point: evaluate with the 10 most similar neighbours per user.
    cos_MSE, cos_RMSE = error(k=10)
    print('OK')