基于用户的协同过滤算法【有训练集与测试集】

皮尔逊相似度

import math
def read2_data():  # for a data set that is already split into train/test
    """Load the pre-split training and test rating files.

    Reads ``u1.base`` (training) and ``u1.test`` (test) from the current
    working directory.  Every non-blank line must hold four tab-separated
    fields: user, movie, rating, timestamp.

    Returns:
        tuple: ``(trainSet, testSet)``; each is a nested dict
        ``{user: {movie: rating}}`` whose keys and ratings are kept as the
        strings read from the file.

    Raises:
        OSError: if either file cannot be opened.
        ValueError: if a non-blank line does not have exactly four fields.
    """

    def load(path):
        ratings = {}
        # ``with`` closes the handle even when a malformed line raises.
        with open(path) as handle:
            for line in handle:
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                user, movie, rating, _timestamp = line.split('\t')
                # setdefault creates the inner dict on a user's first rating.
                ratings.setdefault(user, {})[movie] = rating
        return ratings

    return load('u1.base'), load('u1.test')

def read_data(filename,pivot):  # for a complete data set that is not split yet
    """Randomly split one ratings file into a training and a test set.

    Args:
        filename: path of a text file whose non-blank lines hold four
            comma-separated fields: user, movie, rating, timestamp.
        pivot: probability that a rating is assigned to the training set;
            each line is compared against a fresh ``random.random()`` draw.

    Returns:
        tuple: ``(trainSet, testSet)``; each is a nested dict
        ``{user: {movie: rating}}`` with all values kept as strings.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a non-blank line does not have exactly four fields.
    """
    # Local import: ``random`` is not among the visible module-level imports.
    import random

    trainSet = {}
    testSet = {}
    trainSet_len = 0
    testSet_len = 0
    # The original called ``self.load_file`` although this is a plain
    # function with no ``self``; read the file directly instead.
    with open(filename) as handle:
        for line in handle:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            user, movie, rating, _timestamp = line.split(',')
            if random.random() < pivot:
                trainSet.setdefault(user, {})[movie] = rating
                trainSet_len += 1
            else:
                testSet.setdefault(user, {})[movie] = rating
                testSet_len += 1
    print('已划分')
    print('训练集长度 = %s' % trainSet_len)
    print('测试集长度 = %s' % testSet_len)
    return trainSet,testSet

#计算皮尔逊相似度
def simi_pearson(a,b,trainSet=None):
    """Return the Pearson-style similarity between users ``a`` and ``b``.

    The numerator sums, over the movies rated by both users, the product of
    each user's deviation from their own mean rating.  The means and the
    two norms in the denominator are taken over *all* movies each user
    rated, not only the shared ones.

    Args:
        a: id of the first user (a key of the training set).
        b: id of the second user (a key of the training set).
        trainSet: optional ``{user: {movie: rating}}`` mapping.  When
            omitted it is loaded with ``read2_data()``, which re-reads the
            data files on every call, so pass it explicitly in loops.

    Returns:
        The similarity as a float, or ``0`` when the numerator is zero
        (for example when the users share no rated movie).

    Raises:
        KeyError: if ``a`` or ``b`` is not in the training set.
    """
    if trainSet is None:
        trainSet = read2_data()[0]

    # Ratings are stored as strings; convert each one only once.
    a_ratings = {movie: float(r) for movie, r in trainSet[a].items()}
    b_ratings = {movie: float(r) for movie, r in trainSet[b].items()}
    a_mean = sum(a_ratings.values()) / len(a_ratings)
    b_mean = sum(b_ratings.values()) / len(b_ratings)

    numerator = sum(
        (a_ratings[movie] - a_mean) * (b_ratings[movie] - b_mean)
        for movie in a_ratings
        if movie in b_ratings
    )
    # A zero numerator also covers the zero-denominator case: if either
    # user's ratings are all equal, every product above is zero.
    if not numerator:
        return 0

    denominator = (
        math.sqrt(sum((r - a_mean) ** 2 for r in a_ratings.values()))
        * math.sqrt(sum((r - b_mean) ** 2 for r in b_ratings.values()))
    )
    return numerator / denominator


#通过余弦相似度计算a,b的相似度
def simi_cos(a,b,trainSet=None):
    """Return the cosine similarity between the ratings of users ``a`` and ``b``.

    The dot product runs over movies rated by both users; each vector norm
    is taken over all movies that user rated.

    Args:
        a: id of the first user (a key of the training set).
        b: id of the second user (a key of the training set).
        trainSet: optional ``{user: {movie: rating}}`` mapping.  When
            omitted it is loaded with ``read2_data()``, which re-reads the
            data files on every call, so pass it explicitly in loops.

    Returns:
        The similarity as a float, or ``0`` when the dot product is zero
        (for example when the users share no rated movie).

    Raises:
        KeyError: if ``a`` or ``b`` is not in the training set.
    """
    if trainSet is None:
        trainSet = read2_data()[0]

    # Ratings are stored as strings; convert each one only once.
    a_ratings = {movie: float(r) for movie, r in trainSet[a].items()}
    b_ratings = {movie: float(r) for movie, r in trainSet[b].items()}

    dot = sum(
        a_ratings[movie] * b_ratings[movie]
        for movie in a_ratings
        if movie in b_ratings
    )
    # A zero dot product also covers the zero-norm case.
    if not dot:
        return 0

    norms = (
        math.sqrt(sum(r ** 2 for r in a_ratings.values()))
        * math.sqrt(sum(r ** 2 for r in b_ratings.values()))
    )
    return dot / norms

# 预估评分并求出RMSE
def error(k, trainSet=None, testSet=None, similarity=None):
    """Predict held-out ratings with user-based CF and report per-user errors.

    For every user in the training set the ``k`` most similar other users
    are selected, each of that user's test-set ratings is predicted, and
    the prediction errors are averaged for that user.

    A prediction is the user's own mean training rating plus the
    similarity-weighted average of how far each selected neighbour who
    rated the movie deviates from that neighbour's own mean.  When no
    selected neighbour rated the movie (or the weights sum to zero) the
    user's mean is used on its own.

    Args:
        k: number of nearest neighbours to use.
        trainSet: optional ``{user: {movie: rating}}`` training data;
            loaded with ``read2_data()`` when omitted.
        testSet: optional test data of the same shape; loaded with
            ``read2_data()`` when omitted.
        similarity: optional callable ``similarity(user, other_user)``
            returning a number; defaults to ``simi_pearson``.  The default
            is called with the two user ids only, so pass a matching
            callable when supplying a custom ``trainSet``.

    Returns:
        tuple: ``(list_MSE, list_RMSE)`` with one entry per training user,
        in training-set iteration order.  ``list_MSE`` holds the mean
        *absolute* error (the name is kept for compatibility) and
        ``list_RMSE`` the root-mean-square error.  Users without any
        evaluable test rating get ``0.0`` in both lists.
    """
    if trainSet is None or testSet is None:
        loaded_train, loaded_test = read2_data()
        if trainSet is None:
            trainSet = loaded_train
        if testSet is None:
            testSet = loaded_test
    if similarity is None:
        similarity = simi_pearson

    # Mean training rating of every user, computed once up front.
    means = {
        u: sum(float(r) for r in ratings.values()) / len(ratings)
        for u, ratings in trainSet.items()
    }

    list_MSE = []
    list_RMSE = []
    for user, user_data in trainSet.items():
        simi_dict = {
            other_user: similarity(user, other_user)
            for other_user in trainSet
            if other_user != user
        }
        # Keep the k most similar users, highest similarity first.
        neighbours = sorted(simi_dict.items(), key=lambda x: x[1], reverse=True)[:k]
        print(neighbours)

        abs_sum = 0.0
        sq_sum = 0.0
        evaluated = 0
        # Only movies with a known test rating can contribute to the error.
        for movie, actual in testSet.get(user, {}).items():
            if movie in user_data:
                continue  # already rated in training; nothing to predict
            weighted = 0.0
            weight_total = 0.0
            for other_user, sim in neighbours:
                other_data = trainSet[other_user]
                if movie in other_data:
                    weighted += sim * (float(other_data[movie]) - means[other_user])
                    weight_total += abs(sim)
            pre_rating = means[user]
            if weight_total:
                pre_rating += weighted / weight_total

            diff = pre_rating - float(actual)
            abs_sum += abs(diff)
            sq_sum += diff ** 2  # square each error, not the running sum
            evaluated += 1

        if evaluated:
            MSE = abs_sum / evaluated
            RMSE = math.sqrt(sq_sum / evaluated)
        else:
            MSE = 0.0
            RMSE = 0.0
        list_MSE.append(MSE)
        list_RMSE.append(RMSE)
        print(MSE, RMSE)
    return list_MSE, list_RMSE

def recommend(user):
    """Placeholder for per-user recommendation; not implemented, returns None."""
    return None
# Script entry point: run the evaluation with k = 10 and keep the two
# per-user error lists in module-level names, then print a completion marker.
if __name__ == '__main__':
    pear_MSE,pear_RMSE = error(10)
    print('OK')

杰卡德相似度

import math


def read2_data():  # for a data set that is already split into train/test
    """Load the pre-split training and test rating files.

    Reads ``u1.base`` (training) and ``u1.test`` (test) from the current
    working directory.  Every non-blank line must hold four tab-separated
    fields: user, movie, rating, timestamp.

    Returns:
        tuple: ``(trainSet, testSet)``; each is a nested dict
        ``{user: {movie: rating}}`` whose keys and ratings are kept as the
        strings read from the file.

    Raises:
        OSError: if either file cannot be opened.
        ValueError: if a non-blank line does not have exactly four fields.
    """

    def load(path):
        ratings = {}
        # ``with`` closes the handle even when a malformed line raises.
        with open(path) as handle:
            for line in handle:
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                user, movie, rating, _timestamp = line.split('\t')
                # setdefault creates the inner dict on a user's first rating.
                ratings.setdefault(user, {})[movie] = rating
        return ratings

    return load('u1.base'), load('u1.test')


def read_data(filename, pivot):  # for a complete data set that is not split yet
    """Randomly split one ratings file into a training and a test set.

    Args:
        filename: path of a text file whose non-blank lines hold four
            comma-separated fields: user, movie, rating, timestamp.
        pivot: probability that a rating is assigned to the training set;
            each line is compared against a fresh ``random.random()`` draw.

    Returns:
        tuple: ``(trainSet, testSet)``; each is a nested dict
        ``{user: {movie: rating}}`` with all values kept as strings.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a non-blank line does not have exactly four fields.
    """
    # Local import: ``random`` is not among the visible module-level imports.
    import random

    trainSet = {}
    testSet = {}
    trainSet_len = 0
    testSet_len = 0
    # The original called ``self.load_file`` although this is a plain
    # function with no ``self``; read the file directly instead.
    with open(filename) as handle:
        for line in handle:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            user, movie, rating, _timestamp = line.split(',')
            if random.random() < pivot:
                trainSet.setdefault(user, {})[movie] = rating
                trainSet_len += 1
            else:
                testSet.setdefault(user, {})[movie] = rating
                testSet_len += 1
    print('已划分')
    print('训练集长度 = %s' % trainSet_len)
    print('测试集长度 = %s' % testSet_len)
    return trainSet, testSet



def jacca(a,b,trainSet=None):
    """Return the Jaccard similarity of the movie sets rated by ``a`` and ``b``.

    The result is the number of movies both users rated divided by the
    number of movies at least one of them rated; rating values are ignored.

    Args:
        a: id of the first user (a key of the training set).
        b: id of the second user (a key of the training set).
        trainSet: optional ``{user: {movie: rating}}`` mapping.  When
            omitted it is loaded with ``read2_data()``, which re-reads the
            data files on every call, so pass it explicitly in loops.

    Returns:
        float in ``[0, 1]``; ``0.0`` when neither user rated anything.

    Raises:
        KeyError: if ``a`` or ``b`` is not in the training set.
    """
    if trainSet is None:
        trainSet = read2_data()[0]
    a_movies = set(trainSet[a])
    b_movies = set(trainSet[b])
    union_size = len(a_movies | b_movies)
    if not union_size:
        return 0.0  # avoid 0/0 when both users have no ratings
    return len(a_movies & b_movies) / union_size



# 预估评分并求出RMSE
def error(k, trainSet=None, testSet=None, similarity=None):
    """Predict held-out ratings with user-based CF and report per-user errors.

    For every user in the training set the ``k`` most similar other users
    are selected, each of that user's test-set ratings is predicted, and
    the prediction errors are averaged for that user.

    A prediction is the user's own mean training rating plus the
    similarity-weighted average of how far each selected neighbour who
    rated the movie deviates from that neighbour's own mean.  When no
    selected neighbour rated the movie (or the weights sum to zero) the
    user's mean is used on its own.

    Args:
        k: number of nearest neighbours to use.
        trainSet: optional ``{user: {movie: rating}}`` training data;
            loaded with ``read2_data()`` when omitted.
        testSet: optional test data of the same shape; loaded with
            ``read2_data()`` when omitted.
        similarity: optional callable ``similarity(user, other_user)``
            returning a number; defaults to ``jacca``.  The default is
            called with the two user ids only, so pass a matching callable
            when supplying a custom ``trainSet``.

    Returns:
        tuple: ``(list_MSE, list_RMSE)`` with one entry per training user,
        in training-set iteration order.  ``list_MSE`` holds the mean
        *absolute* error (the name is kept for compatibility) and
        ``list_RMSE`` the root-mean-square error.  Users without any
        evaluable test rating get ``0.0`` in both lists.
    """
    if trainSet is None or testSet is None:
        loaded_train, loaded_test = read2_data()
        if trainSet is None:
            trainSet = loaded_train
        if testSet is None:
            testSet = loaded_test
    if similarity is None:
        similarity = jacca

    print(len(trainSet))  # total number of training users

    # Mean training rating of every user, computed once up front.
    means = {
        u: sum(float(r) for r in ratings.values()) / len(ratings)
        for u, ratings in trainSet.items()
    }

    list_MSE = []
    list_RMSE = []
    for user, user_data in trainSet.items():
        simi_dict = {
            other_user: similarity(user, other_user)
            for other_user in trainSet
            if other_user != user
        }
        # Keep the k most similar users, highest similarity first.
        neighbours = sorted(simi_dict.items(), key=lambda x: x[1], reverse=True)[:k]
        print(neighbours)

        abs_sum = 0.0
        sq_sum = 0.0
        evaluated = 0
        # Only movies with a known test rating can contribute to the error.
        for movie, actual in testSet.get(user, {}).items():
            if movie in user_data:
                continue  # already rated in training; nothing to predict
            weighted = 0.0
            weight_total = 0.0
            for other_user, sim in neighbours:
                other_data = trainSet[other_user]
                if movie in other_data:
                    weighted += sim * (float(other_data[movie]) - means[other_user])
                    weight_total += abs(sim)
            pre_rating = means[user]
            if weight_total:
                pre_rating += weighted / weight_total

            diff = pre_rating - float(actual)
            abs_sum += abs(diff)
            sq_sum += diff ** 2  # square each error, not the running sum
            evaluated += 1

        if evaluated:
            MSE = abs_sum / evaluated
            RMSE = math.sqrt(sq_sum / evaluated)
        else:
            MSE = 0.0
            RMSE = 0.0
        list_MSE.append(MSE)
        list_RMSE.append(RMSE)
        print(MSE, RMSE)
    return list_MSE,list_RMSE


def recommend(user):
    """Placeholder for per-user recommendation; not implemented, returns None."""
    return None

# Script entry point: run the evaluation with k = 10 and keep the two
# per-user error lists in module-level names, then print a completion marker.
if __name__ == '__main__':
    jacca_MSE,jacca_RMSE = error(10)
    print('OK')

余弦相似度

import math


def read2_data():  # for a data set that is already split into train/test
    """Load the pre-split training and test rating files.

    Reads ``u1.base`` (training) and ``u1.test`` (test) from the current
    working directory.  Every non-blank line must hold four tab-separated
    fields: user, movie, rating, timestamp.

    Returns:
        tuple: ``(trainSet, testSet)``; each is a nested dict
        ``{user: {movie: rating}}`` whose keys and ratings are kept as the
        strings read from the file.

    Raises:
        OSError: if either file cannot be opened.
        ValueError: if a non-blank line does not have exactly four fields.
    """

    def load(path):
        ratings = {}
        # ``with`` closes the handle even when a malformed line raises.
        with open(path) as handle:
            for line in handle:
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                user, movie, rating, _timestamp = line.split('\t')
                # setdefault creates the inner dict on a user's first rating.
                ratings.setdefault(user, {})[movie] = rating
        return ratings

    return load('u1.base'), load('u1.test')


def read_data(filename, pivot):  # for a complete data set that is not split yet
    """Randomly split one ratings file into a training and a test set.

    Args:
        filename: path of a text file whose non-blank lines hold four
            comma-separated fields: user, movie, rating, timestamp.
        pivot: probability that a rating is assigned to the training set;
            each line is compared against a fresh ``random.random()`` draw.

    Returns:
        tuple: ``(trainSet, testSet)``; each is a nested dict
        ``{user: {movie: rating}}`` with all values kept as strings.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a non-blank line does not have exactly four fields.
    """
    # Local import: ``random`` is not among the visible module-level imports.
    import random

    trainSet = {}
    testSet = {}
    trainSet_len = 0
    testSet_len = 0
    # The original called ``self.load_file`` although this is a plain
    # function with no ``self``; read the file directly instead.
    with open(filename) as handle:
        for line in handle:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            user, movie, rating, _timestamp = line.split(',')
            if random.random() < pivot:
                trainSet.setdefault(user, {})[movie] = rating
                trainSet_len += 1
            else:
                testSet.setdefault(user, {})[movie] = rating
                testSet_len += 1
    print('已划分')
    print('训练集长度 = %s' % trainSet_len)
    print('测试集长度 = %s' % testSet_len)
    return trainSet, testSet


# 计算皮尔逊相似度
def simi_pearson(a, b, trainSet=None):
    """Return the Pearson-style similarity between users ``a`` and ``b``.

    The numerator sums, over the movies rated by both users, the product of
    each user's deviation from their own mean rating.  The means and the
    two norms in the denominator are taken over *all* movies each user
    rated, not only the shared ones.

    Args:
        a: id of the first user (a key of the training set).
        b: id of the second user (a key of the training set).
        trainSet: optional ``{user: {movie: rating}}`` mapping.  When
            omitted it is loaded with ``read2_data()``, which re-reads the
            data files on every call, so pass it explicitly in loops.

    Returns:
        The similarity as a float, or ``0`` when the numerator is zero
        (for example when the users share no rated movie).

    Raises:
        KeyError: if ``a`` or ``b`` is not in the training set.
    """
    if trainSet is None:
        trainSet = read2_data()[0]

    # Ratings are stored as strings; convert each one only once.
    a_ratings = {movie: float(r) for movie, r in trainSet[a].items()}
    b_ratings = {movie: float(r) for movie, r in trainSet[b].items()}
    a_mean = sum(a_ratings.values()) / len(a_ratings)
    b_mean = sum(b_ratings.values()) / len(b_ratings)

    numerator = sum(
        (a_ratings[movie] - a_mean) * (b_ratings[movie] - b_mean)
        for movie in a_ratings
        if movie in b_ratings
    )
    # A zero numerator also covers the zero-denominator case: if either
    # user's ratings are all equal, every product above is zero.
    if not numerator:
        return 0

    denominator = (
        math.sqrt(sum((r - a_mean) ** 2 for r in a_ratings.values()))
        * math.sqrt(sum((r - b_mean) ** 2 for r in b_ratings.values()))
    )
    return numerator / denominator

def jacca(a,b,trainSet=None):
    """Return the Jaccard similarity of the movie sets rated by ``a`` and ``b``.

    The result is the number of movies both users rated divided by the
    number of movies at least one of them rated; rating values are ignored.

    Args:
        a: id of the first user (a key of the training set).
        b: id of the second user (a key of the training set).
        trainSet: optional ``{user: {movie: rating}}`` mapping.  When
            omitted it is loaded with ``read2_data()`` (the original body
            read an undefined ``trainSet`` name and raised NameError).

    Returns:
        float in ``[0, 1]``; ``0.0`` when neither user rated anything.

    Raises:
        KeyError: if ``a`` or ``b`` is not in the training set.
    """
    if trainSet is None:
        trainSet = read2_data()[0]
    a_movies = set(trainSet[a])
    b_movies = set(trainSet[b])
    union_size = len(a_movies | b_movies)
    if not union_size:
        return 0.0  # avoid 0/0 when both users have no ratings
    return len(a_movies & b_movies) / union_size

# 通过余弦相似度计算a,b的相似度
def simi_cos(a, b, trainSet=None):
    """Return the cosine similarity between the ratings of users ``a`` and ``b``.

    The dot product runs over movies rated by both users; each vector norm
    is taken over all movies that user rated.

    Args:
        a: id of the first user (a key of the training set).
        b: id of the second user (a key of the training set).
        trainSet: optional ``{user: {movie: rating}}`` mapping.  When
            omitted it is loaded with ``read2_data()``, which re-reads the
            data files on every call, so pass it explicitly in loops.

    Returns:
        The similarity as a float, or ``0`` when the dot product is zero
        (for example when the users share no rated movie).

    Raises:
        KeyError: if ``a`` or ``b`` is not in the training set.
    """
    if trainSet is None:
        trainSet = read2_data()[0]

    # Ratings are stored as strings; convert each one only once.
    a_ratings = {movie: float(r) for movie, r in trainSet[a].items()}
    b_ratings = {movie: float(r) for movie, r in trainSet[b].items()}

    dot = sum(
        a_ratings[movie] * b_ratings[movie]
        for movie in a_ratings
        if movie in b_ratings
    )
    # A zero dot product also covers the zero-norm case.
    if not dot:
        return 0

    norms = (
        math.sqrt(sum(r ** 2 for r in a_ratings.values()))
        * math.sqrt(sum(r ** 2 for r in b_ratings.values()))
    )
    return dot / norms


# 预估评分并求出RMSE
def error(k, trainSet=None, testSet=None, similarity=None):
    """Predict held-out ratings with user-based CF and report per-user errors.

    For every user in the training set the ``k`` most similar other users
    are selected, each of that user's test-set ratings is predicted, and
    the prediction errors are averaged for that user.

    A prediction is the user's own mean training rating plus the
    similarity-weighted average of how far each selected neighbour who
    rated the movie deviates from that neighbour's own mean.  When no
    selected neighbour rated the movie (or the weights sum to zero) the
    user's mean is used on its own.

    Args:
        k: number of nearest neighbours to use.
        trainSet: optional ``{user: {movie: rating}}`` training data;
            loaded with ``read2_data()`` when omitted.
        testSet: optional test data of the same shape; loaded with
            ``read2_data()`` when omitted.
        similarity: optional callable ``similarity(user, other_user)``
            returning a number; defaults to ``simi_cos``.  The default is
            called with the two user ids only, so pass a matching callable
            when supplying a custom ``trainSet``.

    Returns:
        tuple: ``(list_MSE, list_RMSE)`` with one entry per training user,
        in training-set iteration order.  ``list_MSE`` holds the mean
        *absolute* error (the name is kept for compatibility) and
        ``list_RMSE`` the root-mean-square error.  Users without any
        evaluable test rating get ``0.0`` in both lists.
    """
    if trainSet is None or testSet is None:
        loaded_train, loaded_test = read2_data()
        if trainSet is None:
            trainSet = loaded_train
        if testSet is None:
            testSet = loaded_test
    if similarity is None:
        similarity = simi_cos

    # Mean training rating of every user, computed once up front.
    means = {
        u: sum(float(r) for r in ratings.values()) / len(ratings)
        for u, ratings in trainSet.items()
    }

    list_MSE = []
    list_RMSE = []
    for user, user_data in trainSet.items():
        simi_dict = {
            other_user: similarity(user, other_user)
            for other_user in trainSet
            if other_user != user
        }
        # Keep the k most similar users, highest similarity first.
        neighbours = sorted(simi_dict.items(), key=lambda x: x[1], reverse=True)[:k]
        print(neighbours)

        abs_sum = 0.0
        sq_sum = 0.0
        evaluated = 0
        # Only movies with a known test rating can contribute to the error.
        for movie, actual in testSet.get(user, {}).items():
            if movie in user_data:
                continue  # already rated in training; nothing to predict
            weighted = 0.0
            weight_total = 0.0
            for other_user, sim in neighbours:
                other_data = trainSet[other_user]
                if movie in other_data:
                    weighted += sim * (float(other_data[movie]) - means[other_user])
                    weight_total += abs(sim)
            pre_rating = means[user]
            if weight_total:
                pre_rating += weighted / weight_total

            diff = pre_rating - float(actual)
            abs_sum += abs(diff)
            sq_sum += diff ** 2  # square each error, not the running sum
            evaluated += 1

        if evaluated:
            MSE = abs_sum / evaluated
            RMSE = math.sqrt(sq_sum / evaluated)
        else:
            MSE = 0.0
            RMSE = 0.0
        list_MSE.append(MSE)
        list_RMSE.append(RMSE)
        print(MSE,RMSE)
    return list_MSE,list_RMSE


def recommend(user):
    """Placeholder for per-user recommendation; not implemented, returns None."""
    return None
# Script entry point: run the evaluation with k = 10 and keep the two
# per-user error lists in module-level names, then print a completion marker.
if __name__ == '__main__':
    cos_MSE,cos_RMSE = error(10)
    print('OK')

 

你可能感兴趣的:(推荐系统)