4.19 I opened up this pile of code and stared at it for half a day. Damn, it's hard. This is the first recommendation paper I've tried to reproduce by hand. The last codebase I read was NCF, and its difficulty doesn't even compare; frankly, anyone with a bit of machine learning / deep learning background can read that code in no time.
This one... is really unfriendly to a newbie like me...
After half a day of staring, I'll start by working through utility; otherwise I'd be reading the main code, have to come back to utility anyway, and by then I'd have forgotten the main code again.
I've been at it all evening. I want to cry...
4.20 My mood yesterday and today has been terrible, and it's entirely this brutally hard KGAT's fault. Nothing I read makes sense, and I can't even get it to run... My only gain today so far (15:17) is solving that divide-by-zero problem, which actually turned out to be just a warning; silly me... Also, I'm done trying to read all of utility; I only skimmed load_data, loader_bprmf and loader_kgat in it. When stuck, it's better to face the main problem head on; circling around its edges is mostly a waste of time. Well, that's not entirely fair, there is always something gained. Today I'll go straight at kgat.
First, here are the load_data and loader_bprmf I read yesterday, with my explanations written into the code as comments. The kgat part starts from the next post (part 6).
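(A side note to myself about that divide-by-zero: numpy only emits a RuntimeWarning there and keeps running, which is why it was never a real error. A minimal sketch of how to track such a warning down or compute around it; the arrays here are made up, not from the KGAT code:)

import numpy as np

hits = np.array([0.0, 1.0])      # made-up numerators
counts = np.array([0.0, 2.0])    # made-up denominators, first entry is 0

# temporarily turn the warning into an exception so the traceback
# points at the offending line
with np.errstate(divide='raise', invalid='raise'):
    try:
        ratio = hits / counts    # 0.0/0.0 trips the 'invalid' error
    except FloatingPointError as e:
        print('found it:', e)

# or sidestep it: only divide where the denominator is nonzero
safe = np.divide(hits, counts, out=np.zeros_like(hits), where=counts != 0)
print(safe)                      # [0.  0.5]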
'''
Created on Dec 18, 2018
Tensorflow Implementation of Knowledge Graph Attention Network (KGAT) model in:
Wang Xiang et al. KGAT: Knowledge Graph Attention Network for Recommendation. In KDD 2019.
@author: Xiang Wang (xiangwang@u.nus.edu)
'''
import collections
import numpy as np
import random as rd
class Data(object):
    # Initialization:
    #   Data.path, Data.args, Data.batch_size
    # train_file: training data; each line is a userID followed by the list of its positive items (itemIDs)
    # test_file: test data; each line is a userID followed by its positive itemIDs; everything unobserved is treated as negative
    # kg_file: each line is one triple (head, relation, tail)
    # train_data, train_user_dict: the training interactions as a list, and as a dict keyed by user
    # test_data, test_user_dict: the test interactions as a list, and as a dict keyed by user
    # n_users: number of users (max userID + 1)
    # n_items: number of items (max itemID + 1)
    # n_train: number of training interactions
    # n_test: number of test interactions
    # n_relations: number of relations (max relation id + 1)
    # n_entities: number of entities (max entity id + 1)
    # n_triples: number of deduplicated triples
    # kg_dict: dict keyed by head, values are (tail, relation) pairs
    # relation_dict: dict keyed by relation, values are (head, tail) pairs
    # batch_size is a command-line argument
    # batch_size_kg: KG batch size, chosen so that one epoch has the same
    # number of KG batches as CF batches (see the computation in __init__)
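    # A made-up example of the file formats (IDs are fabricated):
    #   a line of train.txt / test.txt:  "0 12 34 56"  -> user 0 with positives 12, 34, 56
    #   a line of kg_final.txt:          "12 3 7890"   -> (head=12, relation=3, tail=7890)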
    def __init__(self, args, path):
        self.path = path
        self.args = args

        self.batch_size = args.batch_size

        train_file = path + '/train.txt'
        test_file = path + '/test.txt'

        kg_file = path + '/kg_final.txt'

        # ----------get number of users and items & then load rating data from train_file & test_file------------.
        self.n_train, self.n_test = 0, 0
        self.n_users, self.n_items = 0, 0

        # train_data / test_data are np arrays holding every interaction in the file,
        # each interaction stored as [userID, itemID].
        # train_user_dict / test_user_dict are dicts:
        # key = userID, value = the (deduplicated) list of items that user interacted with
        self.train_data, self.train_user_dict = self._load_ratings(train_file)
        self.test_data, self.test_user_dict = self._load_ratings(test_file)

        # exist_users: the userIDs present in the training data.
        # Wrapped in list() so rd.sample() works; newer Python 3 versions
        # reject a bare dict-keys view.
        self.exist_users = list(self.train_user_dict.keys())

        # _statistic_ratings() computes n_users, n_items, n_train and n_test
        self._statistic_ratings()

        # ----------get number of entities and relations & then load kg data from kg_file ------------.
        self.n_relations, self.n_entities, self.n_triples = 0, 0, 0
        self.kg_data, self.kg_dict, self.relation_dict = self._load_kg(kg_file)

        # ----------print the basic info about the dataset-------------.
        self.batch_size_kg = self.n_triples // (self.n_train // self.batch_size)
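        # A made-up example of the ratio above: with n_train=100000 and
        # batch_size=1000 there are 100 CF batches per epoch; with
        # n_triples=250000 this gives batch_size_kg=2500, so the KG triples
        # are likewise consumed in ~100 batches per epoch.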
        self._print_data_info()

    # reading train & test interaction data.
    def _load_ratings(self, file_name):
        user_dict = dict()
        inter_mat = list()

        lines = open(file_name, 'r').readlines()
        for l in lines:
            tmps = l.strip()
            inters = [int(i) for i in tmps.split(' ')]

            u_id, pos_ids = inters[0], inters[1:]
            pos_ids = list(set(pos_ids))

            for i_id in pos_ids:
                inter_mat.append([u_id, i_id])

            if len(pos_ids) > 0:
                user_dict[u_id] = pos_ids
        return np.array(inter_mat), user_dict
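    # A tiny made-up trace: a file with the single line "0 5 5 9" yields
    #   inter_mat -> np.array([[0, 9], [0, 5]])  (order depends on set())
    #   user_dict -> {0: [9, 5]}                 (the duplicate 5 is dropped)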
    def _statistic_ratings(self):
        self.n_users = max(max(self.train_data[:, 0]), max(self.test_data[:, 0])) + 1
        self.n_items = max(max(self.train_data[:, 1]), max(self.test_data[:, 1])) + 1
        self.n_train = len(self.train_data)
        self.n_test = len(self.test_data)

    # reading kg data.
    def _load_kg(self, file_name):
        def _construct_kg(kg_np):
            # collections.defaultdict(list) creates a dict whose missing keys default to []
            kg = collections.defaultdict(list)
            rd = collections.defaultdict(list)  # note: shadows the `random as rd` import, but only locally

            for head, relation, tail in kg_np:
                kg[head].append((tail, relation))
                rd[relation].append((head, tail))
            return kg, rd

        kg_np = np.loadtxt(file_name, dtype=np.int32)
        kg_np = np.unique(kg_np, axis=0)

        # self.n_relations = len(set(kg_np[:, 1]))
        # self.n_entities = len(set(kg_np[:, 0]) | set(kg_np[:, 2]))

        # n_relations: number of relations (max relation id + 1)
        # n_entities: number of entities (max entity id + 1)
        # n_triples: number of deduplicated triples
        self.n_relations = max(kg_np[:, 1]) + 1
        self.n_entities = max(max(kg_np[:, 0]), max(kg_np[:, 2])) + 1
        self.n_triples = len(kg_np)

        # kg_np is the raw kg_final data (after np.unique)
        # kg_dict: keyed by head, values are (tail, relation) pairs
        # relation_dict: keyed by relation, values are (head, tail) pairs
        kg_dict, relation_dict = _construct_kg(kg_np)
        return kg_np, kg_dict, relation_dict
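    # A tiny made-up trace: kg_np = [[1, 0, 2], [1, 1, 3]] produces
    #   kg_dict       -> {1: [(2, 0), (3, 1)]}
    #   relation_dict -> {0: [(1, 2)], 1: [(1, 3)]}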
    # print the basic statistics of the dataset
    def _print_data_info(self):
        print('[n_users, n_items]=[%d, %d]' % (self.n_users, self.n_items))
        print('[n_train, n_test]=[%d, %d]' % (self.n_train, self.n_test))
        print('[n_entities, n_relations, n_triples]=[%d, %d, %d]' % (self.n_entities, self.n_relations, self.n_triples))
        print('[batch_size, batch_size_kg]=[%d, %d]' % (self.batch_size, self.batch_size_kg))
    # Returns batch_size users, plus one positive item per user (in pos_items,
    # aligned with the order of users) and one negative item per user
    # (in neg_items, same alignment).
    def _generate_train_cf_batch(self):
        # draw a batch_size-long list of users
        if self.batch_size <= self.n_users:
            # rd.sample: sampling without replacement; returns a new list (in random order)
            users = rd.sample(self.exist_users, self.batch_size)
        else:
            # rd.choice picks one element at random; repeating it batch_size times
            # in the comprehension means sampling exist_users WITH replacement
            users = [rd.choice(self.exist_users) for _ in range(self.batch_size)]

        # draw num positive items for user u
        def sample_pos_items_for_u(u, num):
            # pos_items: all positive items of u, taken from train_user_dict
            pos_items = self.train_user_dict[u]
            # n_pos_items: how many positives u has
            n_pos_items = len(pos_items)

            # pos_batch: the num positives drawn for u
            pos_batch = []
            while True:
                # stop once we have num of them
                if len(pos_batch) == num: break
                # pos_id: a random index into pos_items
                pos_id = np.random.randint(low=0, high=n_pos_items, size=1)[0]
                # pos_i_id: the itemID at that index
                pos_i_id = pos_items[pos_id]

                # only add it if it is not already in pos_batch
                if pos_i_id not in pos_batch:
                    pos_batch.append(pos_i_id)
            return pos_batch

        # draw num negative items for user u
        def sample_neg_items_for_u(u, num):
            # neg_items collects the sampled negatives
            neg_items = []
            while True:
                if len(neg_items) == num: break
                # n_items is the number of items (max itemID + 1);
                # neg_i_id is a randomly drawn candidate itemID
                neg_i_id = np.random.randint(low=0, high=self.n_items, size=1)[0]

                # if neg_i_id is neither one of u's positives nor already sampled,
                # accept it as a negative (rejection sampling)
                if neg_i_id not in self.train_user_dict[u] and neg_i_id not in neg_items:
                    neg_items.append(neg_i_id)
            return neg_items

        # for every user u, sample one positive and one negative and append them
        pos_items, neg_items = [], []
        for u in users:
            pos_items += sample_pos_items_for_u(u, 1)
            neg_items += sample_neg_items_for_u(u, 1)

        # returns batch_size users, with each user's positive in pos_items and
        # negative in neg_items, aligned by position
        return users, pos_items, neg_items
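    # A hypothetical draw with batch_size=3 would look like:
    #   users     = [4, 0, 7]
    #   pos_items = [21, 5, 13]   # pos_items[i] is one of users[i]'s positives
    #   neg_items = [90, 66, 42]  # neg_items[i] is unobserved for users[i]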
    def get_sparsity_split(self):
        try:
            split_uids, split_state = [], []
            # sparsity.split alternates lines: an even line describes a split,
            # the following odd line lists the uids belonging to it
            lines = open(self.path + '/sparsity.split', 'r').readlines()

            for idx, line in enumerate(lines):
                if idx % 2 == 0:
                    split_state.append(line.strip())
                    print(line.strip())
                else:
                    split_uids.append([int(uid) for uid in line.strip().split(' ')])
            print('get sparsity split.')

        except Exception:
            # the split file does not exist yet: create it and cache it to disk
            split_uids, split_state = self.create_sparsity_split()

            f = open(self.path + '/sparsity.split', 'w')
            for idx in range(len(split_state)):
                f.write(split_state[idx] + '\n')
                f.write(' '.join([str(uid) for uid in split_uids[idx]]) + '\n')
            print('create sparsity split.')

        return split_uids, split_state
    def create_sparsity_split(self):
        all_users_to_test = list(self.test_user_dict.keys())
        user_n_iid = dict()

        # generate a dictionary to store (key=n_iids, value=a list of uid).
        for uid in all_users_to_test:
            train_iids = self.train_user_dict[uid]
            test_iids = self.test_user_dict[uid]

            n_iids = len(train_iids) + len(test_iids)

            if n_iids not in user_n_iid.keys():
                user_n_iid[n_iids] = [uid]
            else:
                user_n_iid[n_iids].append(uid)
        split_uids = list()

        # split the whole user set into four subsets,
        # each holding roughly 25% of all interactions
        temp = []
        count = 1
        fold = 4

        n_count = (self.n_train + self.n_test)
        n_rates = 0

        split_state = []
        for idx, n_iids in enumerate(sorted(user_n_iid)):
            temp += user_n_iid[n_iids]
            n_rates += n_iids * len(user_n_iid[n_iids])
            n_count -= n_iids * len(user_n_iid[n_iids])

            if n_rates >= count * 0.25 * (self.n_train + self.n_test):
                split_uids.append(temp)

                state = '#inter per user<=[%d], #users=[%d], #all rates=[%d]' % (n_iids, len(temp), n_rates)
                split_state.append(state)
                print(state)

                temp = []
                n_rates = 0
                fold -= 1

            if idx == len(user_n_iid.keys()) - 1 or n_count == 0:
                split_uids.append(temp)

                state = '#inter per user<=[%d], #users=[%d], #all rates=[%d]' % (n_iids, len(temp), n_rates)
                split_state.append(state)
                print(state)

        return split_uids, split_state
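To check that I actually understood what Data produces, a minimal usage sketch (the args object and the dataset path are placeholders I made up; in the real project they come from utility/parser.py and the data folders):

import argparse
from utility.load_data import Data

# stand-in for the project's real parser; Data only reads args.batch_size
args = argparse.Namespace(batch_size=1024)

data = Data(args, path='../Data/yelp2018')  # hypothetical dataset path
# the constructor itself prints [n_users, n_items], [n_train, n_test], etc.

users, pos_items, neg_items = data._generate_train_cf_batch()
# three aligned lists: pos_items[i] / neg_items[i] belong to users[i]
print(users[0], pos_items[0], neg_items[0])

Next up, loader_bprmf.py: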
'''
Created on Dec 18, 2018
Tensorflow Implementation of the Baseline Model, BPRMF, in:
Wang Xiang et al. KGAT: Knowledge Graph Attention Network for Recommendation. In KDD 2019.
@author: Xiang Wang (xiangwang@u.nus.edu)
'''
from utility.load_data import Data
# BPRMF's data loader class, which inherits from Data
class BPRMF_loader(Data):
    def __init__(self, args, path):
        # reuse the parent class's __init__
        super().__init__(args, path)

    # build one training batch:
    # batch_data is a dict with keys 'users', 'pos_items', 'neg_items';
    # the values come straight from Data._generate_train_cf_batch(), i.e. a
    # batch_size-long user list plus each user's positive and negative sample
    def generate_train_batch(self):
        users, pos_items, neg_items = self._generate_train_cf_batch()

        batch_data = {}
        batch_data['users'] = users
        batch_data['pos_items'] = pos_items
        batch_data['neg_items'] = neg_items
        return batch_data

    # build the TensorFlow feed_dict for BPRMF training
    # by mapping batch_data onto the model's placeholders
    def generate_train_feed_dict(self, model, batch_data):
        feed_dict = {
            model.users: batch_data['users'],
            model.pos_items: batch_data['pos_items'],
            model.neg_items: batch_data['neg_items']
        }
        return feed_dict

    # build the feed_dict for testing
    def generate_test_feed_dict(self, model, user_batch, item_batch, drop_flag=False):
        feed_dict = {
            model.users: user_batch,
            model.pos_items: item_batch
        }
        return feed_dict
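The feed_dict methods only require the model to expose matching placeholders. A TF1-style stub of my own (not the repo's BPRMF class) just to show how the pieces plug together:

import tensorflow as tf  # the repo uses the TF1 API

# hypothetical stand-in exposing only the placeholders the loader expects
class FakeModel(object):
    def __init__(self):
        self.users = tf.placeholder(tf.int32, shape=(None,), name='users')
        self.pos_items = tf.placeholder(tf.int32, shape=(None,), name='pos_items')
        self.neg_items = tf.placeholder(tf.int32, shape=(None,), name='neg_items')

model = FakeModel()
loader = BPRMF_loader(args, path='../Data/yelp2018')  # hypothetical path, as above

batch_data = loader.generate_train_batch()
feed_dict = loader.generate_train_feed_dict(model, batch_data)
# a TF1 session would then run e.g. sess.run(train_op, feed_dict=feed_dict)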
No summary today.