西电数据挖掘作业1.二分网络上的链路预测

 

一. 实验内容

1、采用二分网络模型,对文件夹中的“用户—电影”打分数据进行建模,考虑将用户信息、电影详细信息以及打分分值作为该网络上边和节点的权重;

2、根据网络结构特征给出节点相似性度量指标;

3、基于相似性在二分网络上进行链路预测;

4、采用交叉验证的方法验证预测结果;

5、画出ROC曲线来度量预测方法的准确性。

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import auc
import os
import pylab as mpl
from sklearn.utils import shuffle

# Use SimHei as the sans-serif font -- presumably so the Chinese plot title
# renders correctly; confirm the font is installed on the target machine.
mpl.rcParams["font.sans-serif"] = ["SimHei"]
# Disable the Unicode minus sign on axes -- presumably because SimHei lacks
# that glyph; TODO confirm.
mpl.rcParams["axes.unicode_minus"] = False


def compute_r_mat(w, mat_test_rat, user_count, movie_count, k_user):
    """Compute the average ranking score of the test ratings.

    Every user's test ratings are propagated through ``w`` to score all
    movies.  Movies are then ranked per user (rank 1 = highest score) and the
    mean rank of the user's rated movies is divided by
    ``len(movie_count) - k_user``.  The result is the average of that value
    over all users that could be scored.

    Args:
        w: Square movie-by-movie weight matrix.
        mat_test_rat: User-by-movie matrix of test ratings; it is used as a
            multiplicative mask on the ranks, so entries are expected to be
            0/1.
        user_count: One entry per user; only its length is used and it must
            equal the number of rows of ``mat_test_rat``.
        movie_count: One entry per movie; only its length is used and it must
            equal the number of columns of ``mat_test_rat``.
        k_user: Number of test ratings of each user, shape ``(n_users,)``.

    Returns:
        Array of shape ``(1,)`` with the average ranking score, or NaN when
        no user can be scored.
    """
    n_users = len(user_count)
    n_movies = len(movie_count)
    k_user = np.asarray(k_user)

    scores = np.dot(w, mat_test_rat.T).T
    # argsort is ascending, so the j-th smallest score gets rank n_movies - j.
    ascending = np.argsort(scores, axis=1)
    ranks = np.zeros_like(scores)
    ranks[np.arange(n_users)[:, None], ascending] = n_movies - np.arange(n_movies)

    # Only the movies a user actually rated take part in the evaluation.
    rank_sums = (ranks * mat_test_rat).sum(axis=1)

    # Skip users without test ratings, and users that rated every movie: the
    # latter would otherwise divide by zero and turn the average into inf.
    uncollected = n_movies - k_user
    scored = (k_user != 0) & (uncollected != 0)

    r = np.zeros((n_users, 1))
    r[scored, 0] = rank_sums[scored] / k_user[scored] / uncollected[scored]

    # Average over contributing users only; np.average would also count the
    # zero rows of users that were skipped.
    b = np.count_nonzero(r)
    print("非零个数:", b)
    a = r.sum(axis=0)
    print("元素和:", a)
    if b == 0:
        # The mean is undefined; report NaN explicitly instead of dividing by 0.
        return np.full(1, np.nan)
    return a / b


def compute_w_mat(mat_rat, user_count, movie_count):
    """Build the movie-by-movie weight matrix of the bipartite network.

    The result satisfies
    ``W[i, j] = (1 / movie_count[j]) * sum_u mat_rat[u, i] * mat_rat[u, j] / user_count[u]``.

    Args:
        mat_rat: User-by-movie adjacency matrix.
        user_count: Degree of each user, one entry per row of ``mat_rat``.
        movie_count: Degree of each movie, one entry per column of
            ``mat_rat``.

    Returns:
        Array of shape ``(n_movies, n_movies)``.
    """
    mat_rat = np.asarray(mat_rat)
    user_degree = np.asarray(user_count).reshape([-1, 1])
    movie_degree = np.asarray(movie_count).reshape([1, -1])

    # A node with zero degree has an all-zero row/column, so dividing by 1
    # keeps its contribution at 0 instead of producing 0/0 = NaN, which the
    # matrix product below would spread over W.
    user_degree = np.where(user_degree == 0, 1, user_degree)
    movie_degree = np.where(movie_degree == 0, 1, movie_degree)

    temp = (mat_rat / user_degree) / movie_degree
    return np.dot(mat_rat.T, temp)


def _mean_rate_at_cutoffs(order, relevant, cutoffs, n_users):
    """Average share of a user's relevant movies found in each top-n list.

    Users without any relevant movie are left out of the average.  When no
    user has a relevant movie the rate is 0 at every cutoff.
    """
    totals = np.zeros(len(cutoffs))
    contributing = 0
    for user in range(n_users):
        # Relevance flags of this user's movies, best-ranked movie first.
        ranked_flags = relevant[user, order[user]]
        n_relevant = np.count_nonzero(ranked_flags)
        if n_relevant == 0:
            continue
        # hits[n] = number of relevant movies among the top n recommendations.
        hits = np.concatenate(([0], np.cumsum(ranked_flags)))
        totals += hits[cutoffs] / n_relevant
        contributing += 1
    if contributing == 0:
        return totals
    return totals / contributing


def roc_pic(t, user_count, a, b, num=80):
    """Plot one ROC curve for the recommendation scores ``t``.

    For ``num`` evenly spaced fractions q in [0, 1], each user is recommended
    the ``int(a.shape[1] * q)`` best-scored movies.  TPR is the share of the
    user's liked movies inside that list and FPR the share of the disliked
    ones.  Each rate is averaged over the users that have at least one liked
    (respectively disliked) movie.  The curve is drawn on the current
    matplotlib figure with its AUC in the legend.

    Args:
        t: User-by-movie score matrix; expected to have the same shape as
            ``a`` and ``b``.
        user_count: Array with one entry per user; only ``shape[0]`` is used.
        a: User-by-movie matrix whose entries equal to 1 mark liked movies.
        b: User-by-movie matrix whose entries equal to 1 mark disliked movies.
        num: Number of thresholds, i.e. points on the curve.

    Returns:
        None.
    """
    threshold_rate = np.linspace(0, 1, num)
    # Length of the recommendation list at every threshold.
    cutoffs = (a.shape[1] * threshold_rate).astype(int)
    # Movie indices per user, ordered from highest to lowest score.
    order = np.argsort(-t, axis=1)
    n_users = user_count.shape[0]

    th_fprs = _mean_rate_at_cutoffs(order, b == 1, cutoffs, n_users)
    th_tprs = _mean_rate_at_cutoffs(order, a == 1, cutoffs, n_users)

    roc_auc = auc(th_fprs, th_tprs)  # area under the averaged curve
    lw = 2

    plt.plot(th_fprs, th_tprs,
             lw=lw, label='ROC curve (area = %0.3f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='aqua', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title('ROC曲线图')
    plt.legend(loc="lower right")

    return


if __name__ == '__main__':
    # Load the ratings file from the current working directory.
    filename = os.path.join(os.path.abspath('.'), "ratings.dat")
    all_ratings = pd.read_csv(filename, header=None, sep="::",
                              names=["UserId", "MovieId", "Rating", "Datetime"],
                              engine="python")
    res = 0.0

    # k-fold cross validation.  Integer division means the last
    # ``len(all_ratings) % k`` shuffled rows never land in a validation fold.
    k = 10
    k_sample_count = all_ratings.shape[0] // k
    all_ratings = shuffle(all_ratings)

    # Ratings below this value count as "dislike".
    threshold = 3

    # Node degrees.  value_counts() sorts by frequency, so the user degrees
    # are re-sorted by id to line up with the matrix rows; for contiguous ids
    # starting at 1 the row of a user is UserId - 1.  Movie columns keep the
    # frequency order, which movie_index records.
    user_degrees = all_ratings['UserId'].value_counts().sort_index()
    movie_degrees = all_ratings['MovieId'].value_counts()
    user_index = user_degrees.index
    movie_index = movie_degrees.index
    user_count = user_degrees.to_numpy()
    movie_count = movie_degrees.to_numpy()
    userId_max = user_count.shape[0]  # number of users
    movieId_max = movie_count.shape[0]  # number of movies

    # Matrix position of every rating, in shuffled order, looked up once
    # instead of scanning movie_index for each row.
    row_pos = user_index.get_indexer(all_ratings['UserId'])
    col_pos = movie_index.get_indexer(all_ratings['MovieId'])
    rating_values = all_ratings['Rating'].to_numpy()

    # Adjacency matrix of the full data set: an edge wherever a rating exists.
    # It does not depend on the fold, so it and W are built once.
    mat_all = np.zeros([userId_max, movieId_max])
    mat_all[row_pos, col_pos] = rating_values
    mat_all = (mat_all > 0) + 0

    # NOTE(review): as in the original script, W is built from ALL ratings
    # and each fold is scored by propagating that fold's own validation
    # ratings, so the training part of the split never influences the result
    # (the original built training matrices but never used them).  A
    # leakage-free evaluation would derive W and the initial resource from
    # the training fold only; that needs a compute_r_mat signature change and
    # is left for a follow-up.
    W = compute_w_mat(mat_all, user_count, movie_count)

    plt.figure(figsize=(10, 10))  # all folds are drawn on the same figure
    for fold in range(k):
        validation_begin = k_sample_count * fold
        validation_end = k_sample_count * (fold + 1)
        print(f"第{fold + 1}折", validation_begin, validation_end)

        # Validation (test) ratings of this fold.
        fold_rows = slice(validation_begin, validation_end)
        mat_test = np.zeros([userId_max, movieId_max])
        mat_test[row_pos[fold_rows], col_pos[fold_rows]] = rating_values[fold_rows]

        mat_test_all = (mat_test > 0) + 0
        k_user = mat_test_all.sum(axis=1)  # test ratings per user
        mat_test_like = (mat_test >= threshold) + 0
        mat_test_dislike = mat_test_all * ((mat_test < threshold) + 0)

        f_mat1 = np.dot(W, mat_test_like.T).T
        r = compute_r_mat(W, mat_test_all, user_count, movie_count, k_user)
        # compute_r_mat returns a 1-element array; take the scalar explicitly
        # because float() on such an array is deprecated in recent NumPy.
        fold_score = float(np.ravel(r)[0])
        print("结果:", format(fold_score, '.10f'))
        res = fold_score + res

        roc_pic(f_mat1, user_count, mat_test_like, mat_test_dislike)

    # Average over the actual number of folds rather than a literal 10.
    res = res / k
    print("最终结果:", format(float(res), '.10f'))
    plt.show()
  • 结果:

[图1:西电数据挖掘作业1.二分网络上的链路预测(第1张图片)]

[图2:西电数据挖掘作业1.二分网络上的链路预测(第2张图片)]

 

你可能感兴趣的:(机器学习,人工智能)