Reproducing KGAT: Knowledge Graph Attention Network for Recommendation (5)


4.19 I opened this pile of code and stared at it for half the day. Damn, it's hard. This is the first recommendation paper I've tried to reproduce by hand. The last codebase I read was NCF, and its difficulty simply isn't comparable to this one; to be fair, anyone with a bit of machine learning / deep learning background can read that code in no time.
This one... is really unfriendly to a newcomer like me...

After staring at it for ages, I'll start by working through utility. Otherwise, halfway through the main code I'd have to come back to it anyway, and once I finished it I'd have forgotten the main code again.
I've been reading all evening and I want to cry...

4.20 Yesterday and today my mood has been awful, and it's definitely because of this brutally hard KGAT. I can't understand anything, and I can't get it to run either... My only gain so far today (15:17) is fixing that divide-by-zero problem, which actually turned out to be just a warning; I was being dense... Also, I've given up on reading all of utility; I only skimmed load_data, loader_bprmf and loader_kgat. When you have a problem you should face the main problem head-on; circling around its edges is mostly a waste of time. Well, that's not entirely fair, there's always something to be gained. Today I'll go straight at kgat.

First, here are load_data and loader_bprmf, which I read yesterday; the explanations are written as comments in the code. The kgat part starts in the next post (6).
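
For context, the data files the loader reads look roughly like this (toy lines I made up to match the format the comments below describe, not real rows from any dataset): each line of train.txt/test.txt is a userID followed by that user's positive itemIDs, and each line of kg_final.txt is one (head, relation, tail) triple.

# train.txt / test.txt: userID itemID itemID ...
0 153 7682 204
1 89 3061

# kg_final.txt: head relation tail
153 0 20745
7682 3 18842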

1. Utility

1.1 load_data.py

'''
Created on Dec 18, 2018
Tensorflow Implementation of Knowledge Graph Attention Network (KGAT) model in:
Wang Xiang et al. KGAT: Knowledge Graph Attention Network for Recommendation. In KDD 2019.
@author: Xiang Wang (xiangwang@u.nus.edu)
'''
import collections
import numpy as np
import random as rd

class Data(object):
    #initialize the object
    #Data.path
    #Data.args
    #Data.batch_size

    #train_file: training data; each line is a userID followed by the list of that user's positive items (itemIDs)
    #test_file: test data; each line is a userID followed by the list of that user's positive items (itemIDs); every unobserved item is treated as a negative
    #kg_file: each line should be one (head, relation, tail) triple

    #train_data, train_user_dict: the training interactions as a list, and as a dict keyed by user
    #test_data, test_user_dict: the test interactions as a list, and as a dict keyed by user

    #n_users: number of users (max user ID + 1)
    #n_items: number of items (max item ID + 1)
    #n_train: number of training interactions
    #n_test: number of test interactions

    #n_relations: number of relations (max relation ID + 1)
    #n_entities: number of entities (max entity ID + 1)
    #n_triples: number of (deduplicated) KG triples

    # kg_dict: dict keyed by head, with (tail, relation) tuples as values
    # relation_dict: dict keyed by relation, with (head, tail) tuples as values

    #batch_size is passed in as an argument
    #batch_size_kg is the number of KG triples per batch: n_triples // (n_train // batch_size),
    #so that one epoch over the KG roughly matches one epoch over the CF interactions

    def __init__(self, args, path):
        self.path = path
        self.args = args

        self.batch_size = args.batch_size

        train_file = path + '/train.txt'
        test_file = path + '/test.txt'

        kg_file = path + '/kg_final.txt'

        # ----------get number of users and items & then load rating data from train_file & test_file------------.
        self.n_train, self.n_test = 0, 0
        self.n_users, self.n_items = 0, 0

        #train_data and test_data are np arrays holding every interaction in the corresponding file,
        #each interaction stored as [userID, itemID]; the whole list is wrapped in np.array
        #train_user_dict and test_user_dict are dicts:
        #key = userID, value = the (deduplicated) list of items that user interacted with
        self.train_data, self.train_user_dict = self._load_ratings(train_file)
        self.test_data, self.test_user_dict = self._load_ratings(test_file)
        #exist_users is the list of keys of the training dict, i.e. every userID appearing in the training data
        #(wrapped in list() so that random.sample below can work with it)
        self.exist_users = list(self.train_user_dict.keys())


        #_statistic_ratings() computes the total numbers of users (n_users) and items (n_items),
        #plus the numbers of training (n_train) and test (n_test) interactions
        self._statistic_ratings()

        # ----------get number of entities and relations & then load kg data from kg_file ------------.
        self.n_relations, self.n_entities, self.n_triples = 0, 0, 0
        self.kg_data, self.kg_dict, self.relation_dict = self._load_kg(kg_file)

        # ----------print the basic info about the dataset-------------.
        self.batch_size_kg = self.n_triples // (self.n_train // self.batch_size)
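        # A rough worked example with made-up numbers: if n_triples = 2,000,000, n_train = 800,000
        # and batch_size = 1024, then n_train // batch_size = 781 CF batches per epoch, and
        # batch_size_kg = 2,000,000 // 781 = 2560, so one pass over the KG triples roughly
        # lines up with one pass over the CF interactions.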
        self._print_data_info()

    # reading train & test interaction data.
    def _load_ratings(self, file_name):
        user_dict = dict()
        inter_mat = list()

        lines = open(file_name, 'r').readlines()
        for l in lines:
            tmps = l.strip()
            inters = [int(i) for i in tmps.split(' ')]

            u_id, pos_ids = inters[0], inters[1:]
            pos_ids = list(set(pos_ids))

            for i_id in pos_ids:
                inter_mat.append([u_id, i_id])

            if len(pos_ids) > 0:
                user_dict[u_id] = pos_ids
        return np.array(inter_mat), user_dict

    def _statistic_ratings(self):
        self.n_users = max(max(self.train_data[:, 0]), max(self.test_data[:, 0])) + 1
        self.n_items = max(max(self.train_data[:, 1]), max(self.test_data[:, 1])) + 1
        self.n_train = len(self.train_data)
        self.n_test = len(self.test_data)

    # reading train & test interaction data.
    def _load_kg(self, file_name):
        def _construct_kg(kg_np):
            #collections.defaultdict(list) builds a dict whose missing keys default to an empty list
            kg = collections.defaultdict(list)
            rd = collections.defaultdict(list)

            for head, relation, tail in kg_np:
                kg[head].append((tail, relation))
                rd[relation].append((head, tail))
            return kg, rd

        kg_np = np.loadtxt(file_name, dtype=np.int32)
        kg_np = np.unique(kg_np, axis=0)

        # self.n_relations = len(set(kg_np[:, 1]))
        # self.n_entities = len(set(kg_np[:, 0]) | set(kg_np[:, 2]))
        #n_relations: number of relations (max relation ID + 1)
        #n_entities: number of entities (max entity ID + 1)
        #n_triples: number of KG triples
        self.n_relations = max(kg_np[:, 1]) + 1
        self.n_entities = max(max(kg_np[:, 0]), max(kg_np[:, 2])) + 1
        self.n_triples = len(kg_np)

        #kg_np is the raw, deduplicated kg_final data as loaded
        #kg_dict: keyed by head, with (tail, relation) as values
        #relation_dict: keyed by relation, with (head, tail) as values
        kg_dict, relation_dict = _construct_kg(kg_np)

        return kg_np, kg_dict, relation_dict

    #print the basic dataset statistics
    def _print_data_info(self):
        print('[n_users, n_items]=[%d, %d]' % (self.n_users, self.n_items))
        print('[n_train, n_test]=[%d, %d]' % (self.n_train, self.n_test))
        print('[n_entities, n_relations, n_triples]=[%d, %d, %d]' % (self.n_entities, self.n_relations, self.n_triples))
        print('[batch_size, batch_size_kg]=[%d, %d]' % (self.batch_size, self.batch_size_kg))

    #returns a list of users of size batch_size,
    #one positive item per user (stored in pos_items in the same order as users)
    #and one negative item per user (stored in neg_items in the same order as users)
    def _generate_train_cf_batch(self):
        #sample a list of users of size batch_size
        if self.batch_size <= self.n_users:
            #random.sample draws batch_size distinct users from exist_users (without replacement) and returns a new list
            users = rd.sample(self.exist_users, self.batch_size)
        else:
            #rd.choice picks one user at random; the comprehension repeats this batch_size times (with replacement),
            #which is only needed when batch_size > number of users
            users = [rd.choice(self.exist_users) for _ in range(self.batch_size)]

        #sample num positive items for user u
        def sample_pos_items_for_u(u, num):
            #pos_items is the list of u's positive items
            pos_items = self.train_user_dict[u]
            #n_pos_items is the number of u's positive items
            n_pos_items = len(pos_items)
            #pos_batch will hold num positive items for u, drawn from train_user_dict[u]
            pos_batch = []
            while True:
                #stop once we have num items
                if len(pos_batch) == num: break
                #pos_id: a random index into pos_items
                pos_id = np.random.randint(low=0, high=n_pos_items, size=1)[0]
                #pos_i_id is the item ID at that index
                pos_i_id = pos_items[pos_id]
                #add it to pos_batch only if it is not already there
                if pos_i_id not in pos_batch:
                    pos_batch.append(pos_i_id)

            return pos_batch

        #sample num negative items for user u
        def sample_neg_items_for_u(u, num):
            #neg_items collects the sampled negative items
            neg_items = []
            while True:
                if len(neg_items) == num: break
                #n_items is the number of items (max item ID + 1); neg_i_id is a randomly drawn item ID
                neg_i_id = np.random.randint(low=0, high=self.n_items,size=1)[0]
                #if neg_i_id is neither one of u's positive items nor already sampled,
                #add it to neg_items
                if neg_i_id not in self.train_user_dict[u] and neg_i_id not in neg_items:
                    neg_items.append(neg_i_id)
            return neg_items

        #for each user u, sample one positive and one negative item and append them to pos_items and neg_items
        pos_items, neg_items = [], []
        for u in users:
            pos_items += sample_pos_items_for_u(u, 1)
            neg_items += sample_neg_items_for_u(u, 1)
        #return the batch_size users, each user's positive item (in pos_items) and negative item (in neg_items), both in the same order as users
        return users, pos_items, neg_items



    def get_sparsity_split(self):
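        #load the sparsity split from sparsity.split if the file exists;
        #otherwise build it with create_sparsity_split() and write it to disk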
        try:
            split_uids, split_state = [], []
            lines = open(self.path + '/sparsity.split', 'r').readlines()

            for idx, line in enumerate(lines):
                if idx % 2 == 0:
                    split_state.append(line.strip())
                    print(line.strip())
                else:
                    split_uids.append([int(uid) for uid in line.strip().split(' ')])
            print('get sparsity split.')

        except Exception:
            split_uids, split_state = self.create_sparsity_split()
            f = open(self.path + '/sparsity.split', 'w')
            for idx in range(len(split_state)):
                f.write(split_state[idx] + '\n')
                f.write(' '.join([str(uid) for uid in split_uids[idx]]) + '\n')
            print('create sparsity split.')

        return split_uids, split_state



    def create_sparsity_split(self):
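        #split the test users into four groups by their total (train + test) interaction count,
        #so that each group covers roughly 25% of all interactions (used for the sparsity-level evaluation)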
        all_users_to_test = list(self.test_user_dict.keys())
        user_n_iid = dict()

        # generate a dictionary to store (key=n_iids, value=a list of uid).
        for uid in all_users_to_test:
            train_iids = self.train_user_dict[uid]
            test_iids = self.test_user_dict[uid]

            n_iids = len(train_iids) + len(test_iids)

            if n_iids not in user_n_iid.keys():
                user_n_iid[n_iids] = [uid]
            else:
                user_n_iid[n_iids].append(uid)
        split_uids = list()

        # split the whole user set into four subset.
        temp = []
        count = 1
        fold = 4
        n_count = (self.n_train + self.n_test)
        n_rates = 0

        split_state = []
        for idx, n_iids in enumerate(sorted(user_n_iid)):
            temp += user_n_iid[n_iids]
            n_rates += n_iids * len(user_n_iid[n_iids])
            n_count -= n_iids * len(user_n_iid[n_iids])

            if n_rates >= count * 0.25 * (self.n_train + self.n_test):
                split_uids.append(temp)

                state = '#inter per user<=[%d], #users=[%d], #all rates=[%d]' %(n_iids, len(temp), n_rates)
                split_state.append(state)
                print(state)

                temp = []
                n_rates = 0
                fold -= 1

            if idx == len(user_n_iid.keys()) - 1 or n_count == 0:
                split_uids.append(temp)

                state = '#inter per user<=[%d], #users=[%d], #all rates=[%d]' % (n_iids, len(temp), n_rates)
                split_state.append(state)
                print(state)


        return split_uids, split_state
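
To check that I understood the sampling logic, here is a minimal usage sketch of the Data class. It is my own toy example, not code from the repo: the args object is faked with SimpleNamespace (the real repo builds it with argparse), and the path is just a placeholder for any folder containing train.txt, test.txt and kg_final.txt.

from types import SimpleNamespace
from utility.load_data import Data

# hypothetical stand-in for the argparse args; only batch_size is needed here
args = SimpleNamespace(batch_size=1024)

# any dataset folder with train.txt, test.txt and kg_final.txt; this path is just an example
data = Data(args, '../Data/last-fm')

# one CF training batch: batch_size users, one positive and one negative item per user
users, pos_items, neg_items = data._generate_train_cf_batch()
print(len(users), len(pos_items), len(neg_items))  # all equal to batch_size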

1.2 loader_bprmf.py

'''
Created on Dec 18, 2018
Tensorflow Implementation of the Baseline Model, BPRMF, in:
Wang Xiang et al. KGAT: Knowledge Graph Attention Network for Recommendation. In KDD 2019.
@author: Xiang Wang (xiangwang@u.nus.edu)
'''
from utility.load_data import Data

#the BPRMF data-loader class, inheriting from the Data class
class BPRMF_loader(Data):
    def __init__(self, args, path):
        #reuse the parent class's __init__
        super().__init__(args, path)

    #build a training batch
    #generates batch_data (a dict) with keys 'users', 'pos_items' and 'neg_items';
    #as in Data, the values are the batch_size users sampled by Data plus each user's positive and negative item
    #returns the batch_data dict
    def generate_train_batch(self):
        users, pos_items, neg_items = self._generate_train_cf_batch()

        batch_data = {}
        batch_data['users'] = users
        batch_data['pos_items'] = pos_items
        batch_data['neg_items'] = neg_items

        return batch_data

    #build the TensorFlow feed_dict for BPRMF
    #maps the entries of batch_data to the model's placeholders
    def generate_train_feed_dict(self, model, batch_data):
        feed_dict = {
            model.users: batch_data['users'],
            model.pos_items: batch_data['pos_items'],
            model.neg_items: batch_data['neg_items']
        }

        return feed_dict

    #build the feed_dict for testing
    def generate_test_feed_dict(self, model, user_batch, item_batch, drop_flag=False):
        feed_dict = {
            model.users: user_batch,
            model.pos_items: item_batch
        }
        return feed_dict
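
To put the loader into context, my understanding is that the training code uses it roughly like the sketch below. This is not the repo's actual Main.py: sess, model, args.epoch and the attribute names model.opt / model.loss are assumptions I'm making about the TF model class, not names confirmed from the source.

# hedged sketch of a BPRMF training loop built on BPRMF_loader (not the repo's Main.py)
data_generator = BPRMF_loader(args=args, path='../Data/last-fm')

n_batch = data_generator.n_train // args.batch_size
for epoch in range(args.epoch):
    for idx in range(n_batch):
        # sample batch_size users with one positive and one negative item each
        batch_data = data_generator.generate_train_batch()
        # map the batch onto the model's placeholders
        feed_dict = data_generator.generate_train_feed_dict(model, batch_data)
        # assumed attributes: model.opt is the optimization op, model.loss the BPR loss
        _, batch_loss = sess.run([model.opt, model.loss], feed_dict=feed_dict)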


Summary

No summary today.
