一. 实验内容
1、采用二分网络模型,对文件夹中的“用户---电影”打分数据进行建模,考虑将用户信息、电影详细信息、以及打分分值作为该网络上的边、点的权重;
2、根据网络结构特征给出节点相似性度量指标;
3、基于相似性在二分网络上进行链路预测;
4、采用交叉验证的方法验证预测结果;
5、画出ROC曲线来度量预测方法的准确性。
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import auc
import os
import pylab as mpl
from sklearn.utils import shuffle
# Plot-wide matplotlib settings: select SimHei as the sans-serif font
# (presumably so the Chinese plot title renders -- confirm the font is
# installed) and turn off the Unicode minus glyph for axis labels.
mpl.rcParams.update({
    "font.sans-serif": ["SimHei"],
    "axes.unicode_minus": False,
})
def compute_r_mat(w, mat_test_rat, user_count, movie_count, k_user):
    """Return the mean normalised rank of the movies flagged per user.

    Scores are computed as ``(w @ mat_test_rat.T).T``.  Inside every user
    row the movies are ranked by score (rank 1 = highest score).  For each
    user with a non-zero ``k_user`` entry, the ranks of the movies that are
    non-zero in ``mat_test_rat`` are summed, then divided by ``k_user[i]``
    and by ``len(movie_count) - k_user[i]``.  The returned value is the sum
    of those per-user numbers divided by how many of them are non-zero.

    Args:
        w: Movie-by-movie weight matrix.
        mat_test_rat: User-by-movie matrix; it both seeds the scores and
            masks which ranks are accumulated.  Assumed to have
            ``len(user_count)`` rows and ``len(movie_count)`` columns.
        user_count: Only its length (number of users) is used.
        movie_count: Only its length (number of movies) is used.
        k_user: Per-user count of flagged movies; users with 0 are skipped.

    Returns:
        NumPy array of shape ``(1,)`` with the averaged value, or ``[nan]``
        when no user contributes (previously an accidental 0/0).
    """
    n_users = len(user_count)
    n_movies = len(movie_count)
    k_user = np.asarray(k_user)

    scores = np.dot(w, mat_test_rat.T).T
    # argsort is ascending: column j of `ascending` holds the movie with the
    # (j + 1)-th smallest score, so that movie receives rank n_movies - j.
    ascending = np.argsort(scores, axis=1)
    ranks = np.zeros_like(scores)
    np.put_along_axis(ranks, ascending, np.arange(n_movies, 0, -1), axis=1)

    # Users without any flagged movie do not take part in the metric.
    active = k_user != 0
    ranks[~active] = 0

    # Keep only the ranks of the movies flagged in mat_test_rat.
    rank_sums = (ranks * mat_test_rat).sum(axis=1)

    # Normaliser: number of movies the user did not flag.  A user who
    # flagged every movie would give 0 here and an infinite score, so the
    # value is clamped to 1.
    unrated = n_movies - k_user
    unrated = np.where(unrated > 0, unrated, 1)

    r = np.zeros((n_users, 1))
    r[active, 0] = rank_sums[active] / k_user[active] / unrated[active]

    b = np.count_nonzero(r)
    print("非零个数:", b)
    a = r.sum(axis=0)
    print("元素和:", a)
    if b == 0:
        # No user contributed: the average is undefined.
        return np.full(1, np.nan)
    return a / b
def compute_w_mat(mat_rat, user_count, movie_count):
    """Build the movie-by-movie resource-allocation matrix ``W``.

    Computes ``W = mat_rat.T @ (mat_rat / user_count[:, None]
    / movie_count[None, :])``, i.e. entry ``(a, b)`` sums, over users,
    ``mat_rat[u, a] * mat_rat[u, b] / (user_count[u] * movie_count[b])``.

    Args:
        mat_rat: User-by-movie matrix.
        user_count: One divisor per user row of ``mat_rat``.
        movie_count: One divisor per movie column of ``mat_rat``.

    Returns:
        Array of shape ``(n_movies, n_movies)``.
    """
    user_deg = np.asarray(user_count).reshape([-1, 1])
    movie_deg = np.asarray(movie_count).reshape([1, -1])
    # A zero divisor would turn the matching (all-zero) row or column into
    # 0/0 = NaN, and the dot product below would spread that NaN over W.
    # Dividing by 1 instead leaves those zero entries at zero.
    user_deg = np.where(user_deg == 0, 1, user_deg)
    movie_deg = np.where(movie_deg == 0, 1, movie_deg)

    temp = (mat_rat / user_deg) / movie_deg
    return np.dot(mat_rat.T, temp)
def _mean_hit_rate(flags_by_rank, cutoffs):
    """Average, over users with at least one flagged movie, the share of
    their flagged movies found within the first ``cutoffs[i]`` ranks."""
    totals = flags_by_rank.sum(axis=1)
    eligible = totals > 0
    if not eligible.any():
        # Nobody has a flagged movie: report 0 instead of dividing by zero.
        return np.zeros(len(cutoffs))
    # hits[u, n] = number of flagged movies among the user's first n ranks.
    hits = np.cumsum(flags_by_rank[eligible], axis=1, dtype=np.int32)
    hits = np.concatenate(
        [np.zeros((hits.shape[0], 1), dtype=hits.dtype), hits], axis=1)
    return (hits[:, cutoffs] / totals[eligible][:, None]).mean(axis=0)


def roc_pic(t, user_count, a, b, num=80):
    """Plot one ROC curve for the score matrix ``t`` on the current figure.

    For ``num`` evenly spaced fractions ``q`` in [0, 1], the first
    ``int(a.shape[1] * q)`` movies of every user's score ranking (highest
    score first) count as recommended.  The true-positive rate is the share
    of the user's ``a == 1`` movies among them, the false-positive rate the
    share of the user's ``b == 1`` movies; each rate is averaged over the
    users that have at least one such movie.  The curve, its area (via
    ``auc``) and the diagonal reference line are drawn with ``plt``.

    Args:
        t: User-by-movie score matrix; assumed to have the same shape as
            ``a`` and ``b``.
        user_count: Only ``user_count.shape[0]`` (number of users) is used.
        a: User-by-movie matrix whose entries equal to 1 mark liked movies.
        b: User-by-movie matrix whose entries equal to 1 mark disliked movies.
        num: Number of thresholds (points on the curve).

    Returns:
        None.
    """
    n_users = user_count.shape[0]
    threshold_rate = np.linspace(0, 1, num)

    # Movie indices per user, best score first.
    sort_result = np.argsort(-t[:n_users], axis=1)

    # Recommendation-list length per threshold (truncated like int()).
    cutoffs = (a.shape[1] * threshold_rate).astype(int)
    cutoffs = np.minimum(cutoffs, sort_result.shape[1])

    # Re-order the like/dislike flags along each user's ranking once, so a
    # cumulative count answers "how many hits in the top n" for every
    # threshold without re-intersecting index sets.
    liked_by_rank = np.take_along_axis(a[:n_users] == 1, sort_result, axis=1)
    disliked_by_rank = np.take_along_axis(b[:n_users] == 1, sort_result, axis=1)

    th_tprs = _mean_hit_rate(liked_by_rank, cutoffs)
    th_fprs = _mean_hit_rate(disliked_by_rank, cutoffs)

    roc_auc = auc(th_fprs, th_tprs)  # area under the averaged curve
    lw = 2
    plt.plot(th_fprs, th_tprs,
             lw=lw, label='ROC curve (area = %0.3f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='aqua', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title('ROC曲线图')
    plt.legend(loc="lower right")
    return
if __name__ == '__main__':
    # Load the "UserId::MovieId::Rating::Datetime" records from the current
    # working directory.  os.path.join is used because a hard-coded "\\"
    # separator only resolves on Windows.
    filename = os.path.join(os.path.abspath('.'), "ratings.dat")
    all_ratings = pd.read_csv(filename, header=None, sep="::",
                              names=["UserId", "MovieId", "Rating", "Datetime"],
                              engine="python")

    # k-fold cross-validation: shuffle once, then cut k folds of equal size.
    # Integer division means up to k - 1 trailing rows are never held out.
    k = 10
    k_sample_count = all_ratings.shape[0] // k
    all_ratings = shuffle(all_ratings)

    # Fixed node order for both sides of the bipartite graph.  Row i is the
    # i-th smallest user id and column j the j-th smallest movie id, so the
    # per-row / per-column degrees computed below line up with the matrices
    # (value_counts() order is by frequency and does not).
    user_ids = np.sort(all_ratings['UserId'].unique())
    movie_ids = np.sort(all_ratings['MovieId'].unique())

    # A rating below this value counts as "dislike", otherwise "like".
    threshold = 3

    def _rating_matrix(ratings):
        """Scatter a ratings frame into a dense user-by-movie matrix."""
        dense = np.zeros([user_ids.shape[0], movie_ids.shape[0]])
        rows = np.searchsorted(user_ids, ratings['UserId'].to_numpy())
        cols = np.searchsorted(movie_ids, ratings['MovieId'].to_numpy())
        dense[rows, cols] = ratings['Rating'].to_numpy()
        return dense

    res = 0.0
    plt.figure(figsize=(10, 10))  # all folds are drawn on this one figure
    for fold in range(k):
        validation_begin = k_sample_count * fold
        validation_end = k_sample_count * (fold + 1)
        print(f"第{fold + 1}折", validation_begin, validation_end)

        # Held-out fold and the remaining rows used to fit the model.
        test_ratings = all_ratings[validation_begin:validation_end]
        train_ratings = pd.concat([
            all_ratings[:validation_begin],
            all_ratings[validation_end:],
        ])

        mat = _rating_matrix(train_ratings)
        mat_test = _rating_matrix(test_ratings)

        # Training network: an edge wherever a rating exists.
        mat_train_all = (mat > 0) + 0
        mat_like = (mat >= threshold) + 0

        # Held-out links and their like / dislike split.
        mat_test_all = (mat_test > 0) + 0
        k_user = mat_test_all.sum(axis=1)  # held-out ratings per user
        mat_test_like = (mat_test >= threshold) + 0
        mat_test_dislike = mat_test_all * ((mat_test < threshold) + 0)

        # Node degrees in the training network.  Nodes whose ratings all
        # fell into the held-out fold have degree 0; use 1 so the
        # normalisation does not divide by zero (their rows are all zero).
        user_count = np.maximum(mat_train_all.sum(axis=1), 1)
        movie_count = np.maximum(mat_train_all.sum(axis=0), 1)

        # Fit the similarity on the training fold only; building it from the
        # full rating set would leak the held-out links into the model.
        W = compute_w_mat(mat_train_all, user_count, movie_count)

        # Scores for the ROC curve: diffuse the training likes through W and
        # check how the held-out likes / dislikes are ranked.
        f_mat1 = np.dot(W, mat_like.T).T

        # NOTE(review): compute_r_mat derives its scores from the matrix it
        # is handed (the held-out links), so this figure is still
        # optimistic; separating seed and evaluation matrices needs a change
        # inside compute_r_mat.
        r = compute_r_mat(W, mat_test_all, user_count, movie_count, k_user)
        fold_score = float(np.ravel(r)[0])  # r is a one-element array
        print("结果:", format(fold_score, '.10f'))
        res += fold_score

        roc_pic(f_mat1, user_count, mat_test_like, mat_test_dislike)

    # Average over the folds actually run (was a hard-coded 10).
    res = res / k
    print("最终结果:", format(res, '.10f'))
    plt.show()