ComplEx is an improvement over DistMult.
Premise: both follow the bilinear model, whose scoring function is f(h, r, t) = h^T · M_r · t; DistMult restricts the relation matrix M_r to a diagonal matrix diag(r).
Basic assumption: ComplEx represents h, r and t as complex vectors u = a + bi instead of real ones.
Scoring function: the score becomes f(h, r, t) = Re(h^T · diag(r) · conj(t)), where conj(t) is the complex conjugate of t.
Antisymmetry: because the tail is conjugated, f(h, r, t) and f(t, r, h) = Re(conj(h)^T · diag(r) · t) differ whenever r has a non-zero imaginary part, so training can give the former a high score and the latter a low score.
Symmetry: setting the imaginary part of r to zero makes the score symmetric in h and t.
Transitivity: not satisfied, which can be shown from a simple property of the dot product.
Model transitivity as: given a·b and b·c, can a·c be inferred? Write a = (x1, y1), b = (x2, y2), c = (x3, y3) and pick arbitrary values, for example:
x1·x2 + y1·y2 = 1
x2·x3 + y2·y3 = 2
These two equations do not determine x1·x3 + y1·y3.
One-to-many: it suffices for t1 and t2 to have the same projection onto h, so several tails can score equally well for the same (h, r).
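A minimal numerical sketch (my own example, not from the original post) of the properties above: it scores a triple with the real-valued expansion of Re(<h, r, conj(t)>) and checks antisymmetry and symmetry on random vectors.

import numpy as np

def complex_score(h, r, t):
    # ComplEx score Re(<h, r, conj(t)>), with each vector split into (real part, imaginary part).
    (re_h, im_h), (re_r, im_r), (re_t, im_t) = h, r, t
    return np.sum(re_h * re_r * re_t + re_h * im_r * im_t
                  + im_h * re_r * im_t - im_h * im_r * re_t)

rng = np.random.default_rng(0)
dim = 4
h = rng.normal(size=(2, dim))  # row 0 = real part, row 1 = imaginary part
r = rng.normal(size=(2, dim))
t = rng.normal(size=(2, dim))

# Antisymmetry: with a non-zero imaginary part of r, swapping h and t changes the score.
print(complex_score(h, r, t), complex_score(t, r, h))

# Symmetry: zeroing the imaginary part of r makes the score symmetric in h and t.
r_sym = np.stack([r[0], np.zeros(dim)])
print(complex_score(h, r_sym, t), complex_score(t, r_sym, h))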
The Prowler starred_actors Evelyn Keyes
Robinson Crusoe in_language English
Memory starred_actors Billy Zane
The Wrath of God starred_actors Robert Mitchum
The Departed has_tags police
Kismet in_language English
Hoodwinked! has_tags children
Village of the Damned has_tags remake
Reel Injun written_by Neil Diamond
The Parent Trap starred_actors Dennis Quaid
Tarzan has_tags disney animated feature
Yellow Sky has_genre Western
The Happening release_year 2008
April Fool's Day has_genre Horror
Vantage Point has_tags assassination
A Patch of Blue starred_actors Elizabeth Hartman
Brian's Song starred_actors Jack Warden
...
...
...
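The triples above are a sample of the knowledge-graph file that the code below trains on (MetaQA is the default dataset in main.py). A minimal parsing sketch, assuming the fields are tab-separated as load_data.py expects:

# Hypothetical line from the KG file; load_data.py splits each line on '\t'.
line = "The Prowler\tstarred_actors\tEvelyn Keyes"
head, relation, tail = line.split('\t')
print((head, relation, tail))  # ('The Prowler', 'starred_actors', 'Evelyn Keyes')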
model.py
import numpy as np
import torch
from torch.nn.init import xavier_normal_
import torch.nn as nn
import torch.nn.functional as F
# Embeds the knowledge graph's entities and relations
class EmbedModel(torch.nn.Module):
def __init__(self, d, ent_vec_dim, rel_vec_dim, **kwargs):
super(EmbedModel, self).__init__()
self.model_name = kwargs["model_name"]
multiplier = 2
self.loss_type = kwargs['loss_type']
if self.loss_type == 'BCE':
self.loss = self.bce_loss # self.loss = torch.nn.BCELoss()
self.bce_loss_loss = torch.nn.BCELoss()
elif self.loss_type == 'CE':
self.loss = self.ce_loss
else:
print('Incorrect loss specified:', self.loss_type)
exit(0)
self.model = self.ComplEx
self.E_Embedding = torch.nn.Embedding(len(d.entities), ent_vec_dim * multiplier, padding_idx=0)
self.R_Embedding = torch.nn.Embedding(len(d.relations), rel_vec_dim * multiplier, padding_idx=0)
self.entity_dim = ent_vec_dim * multiplier
self.do_batch_norm = True
if kwargs["do_batch_norm"] == False:
self.do_batch_norm = False
self.input_dropout = torch.nn.Dropout(kwargs["input_dropout"])
self.hidden_dropout1 = torch.nn.Dropout(kwargs["hidden_dropout1"])
self.hidden_dropout2 = torch.nn.Dropout(kwargs["hidden_dropout2"])
self.l3_reg = kwargs["l3_reg"]
# BatchNorm1d arguments:
# when the input shape is (N, C, L), num_features should be C; N is the batch size, C the number of channels, L the sequence length.
# when the input shape is (N, L), num_features should be L; N is the batch size and L the length, i.e. each sample has a single implicit channel.
self.bn0 = torch.nn.BatchNorm1d(num_features=multiplier)
self.bn1 = torch.nn.BatchNorm1d(num_features=multiplier) # defined (and later saved) but not applied in the ComplEx scoring path
self.bn2 = torch.nn.BatchNorm1d(num_features=multiplier)
self.logsoftmax = torch.nn.LogSoftmax(dim=-1)
def init(self):
xavier_normal_(self.E_Embedding.weight.data)
xavier_normal_(self.R_Embedding.weight.data)
def freeze_entity_embeddings(self):
self.E_Embedding.weight.requires_grad = False
def ce_loss(self, pred, true):
pred = F.log_softmax(pred, dim=-1)
true = true / true.size(-1)
loss = -torch.sum(pred * true)
return loss
def bce_loss(self, pred, true):
loss = self.bce_loss_loss(pred, true)
# l3 regularization
if self.l3_reg:
norm = torch.norm(self.E_Embedding.weight.data, p=3, dim=-1)
loss += self.l3_reg * torch.sum(norm)
return loss
def ComplEx(self, head, relation): # head.shape = torch.Size([batch_size, 400]); relation.shape = torch.Size([batch_size, 400])
heads_tuple = torch.chunk(head, 2, dim=1) # heads[0].shape = torch.Size([8, 200])
# print("model.py---->ComplEx---->heads_tuple[0].shape = {0}".format(heads_tuple[0].shape))
head = torch.stack(list(heads_tuple), dim=1) # torch.Size([8, 400])---->torch.Size([8, 2, 200])
if self.do_batch_norm:
head = self.bn0(head)
head = self.input_dropout(head)
head = head.permute(1, 0, 2) # torch.Size([8, 2, 200])---->torch.Size([2, 8, 200])
# print("model.py---->ComplEx---->head.shape = {0}".format(head.shape))
re_head = head[0] # re_head.shape = torch.Size([8, 200])
im_head = head[1] # im_head.shape = torch.Size([8, 200])
# print("model.py---->ComplEx---->re_head.shape = {0}; im_head.shape = {1}".format(re_head.shape, im_head.shape))
relation = self.hidden_dropout1(relation) # relation.shape = torch.Size([8, 400])
# print("model.py---->ComplEx---->relation.shape = {0}".format(relation.shape))
re_relation, im_relation = torch.chunk(relation, 2, dim=1) # re_relation.shape = torch.Size([8, 200]); im_relation.shape = torch.Size([8, 200])
# print("model.py---->ComplEx---->re_relation.shape = {0}; im_relation.shape = {1}".format(re_relation.shape, im_relation.shape))
re_tail, im_tail = torch.chunk(self.E_Embedding.weight, 2, dim=1) # re_tail.shape = torch.Size([14541, 200]); im_tail.shape = torch.Size([14541, 200])
# print("model.py---->ComplEx---->re_tail.shape = {0}; im_tail.shape = {1}".format(re_tail.shape, im_tail.shape))
re_score = re_head * re_relation - im_head * im_relation # re_score.shape = torch.Size([8, 200])
im_score = re_head * im_relation + im_head * re_relation # im_score.shape = torch.Size([8, 200])
# print("model.py---->ComplEx---->re_score.shape = {0}; im_score.shape = {1}".format(re_score.shape, im_score.shape))
score = torch.stack([re_score, im_score], dim=1) # score.shape = torch.Size([8, 2, 200])
# print("model.py---->ComplEx---->score.shape = {0}".format(score.shape))
if self.do_batch_norm:
score = self.bn2(score)
score = self.hidden_dropout2(score)
score = score.permute(1, 0, 2) # score.shape = torch.Size([2, 8, 200])
# print("model.py---->ComplEx---->score.shape = {0}".format(score.shape))
re_score = score[0] # re_score.shape = torch.Size([8, 200])
im_score = score[1] # im_score.shape = torch.Size([8, 200])
score = torch.mm(re_score, re_tail.transpose(1, 0)) + torch.mm(im_score, im_tail.transpose(1, 0)) # score.shape = torch.Size([8, 14541])
# print("model.py---->ComplEx---->score.shape = {0}".format(score.shape))
return score
# e1_idx: head-entity ids of one batch, e.g. tensor([12711, 1016, 11215, 5200, 6072, 8968, 11427, 13015], device='cuda:0')
# r_idx: relation ids of one batch, e.g. tensor([382, 384, 372, 433, 319, 100, 281, 376], device='cuda:0')
def forward(self, e1_idx, r_idx):
e1 = self.E_Embedding(e1_idx)
r = self.R_Embedding(r_idx)
# print("model.py---->forward---->e1.shape = {0}; r.shape = {1}".format(e1.shape, r.shape)) # e1.shape = torch.Size([batch_size, 400]); r.shape = torch.Size([batch_size, 400])
ans = self.model(head=e1, relation=r)
pred = torch.sigmoid(ans)
return pred
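A small sanity check (my own sketch, not part of the files listed here) that the real/imaginary arithmetic in EmbedModel.ComplEx computes Re(<h, r, conj(t)>): the same score is recomputed with PyTorch complex tensors, with batch norm and dropout omitted and an illustrative dimension.

import torch

dim = 200
re_h, im_h = torch.randn(dim), torch.randn(dim)
re_r, im_r = torch.randn(dim), torch.randn(dim)
re_t, im_t = torch.randn(dim), torch.randn(dim)

# Real-valued expansion, as in EmbedModel.ComplEx (without batch norm / dropout).
re_score = re_h * re_r - im_h * im_r
im_score = re_h * im_r + im_h * re_r
score_split = (re_score * re_t + im_score * im_t).sum()

# The same score computed directly on complex tensors.
h = torch.complex(re_h, im_h)
r = torch.complex(re_r, im_r)
t = torch.complex(re_t, im_t)
score_complex = torch.real((h * r * torch.conj(t)).sum())

print(torch.allclose(score_split, score_complex, atol=1e-4))  # True (up to floating-point error)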
load_data.py
class Data:
def __init__(self, data_dir=None, reverse=False):
self.train_data = self.load_data(data_dir, "train", reverse=reverse) # all (head, relation, tail) triples in the train split
self.valid_data = self.load_data(data_dir, "valid", reverse=reverse) # all (head, relation, tail) triples in the valid split
self.test_data = self.load_data(data_dir, "test", reverse=reverse) # all (head, relation, tail) triples in the test split
self.data = self.train_data + self.valid_data + self.test_data # all triples in the dataset
self.entities = self.get_entities(self.data) # all entities in the dataset
print("load_data.py---->total number of entities: len(self.entities) = ", len(self.entities))
self.train_relations = self.get_relations(self.train_data) # relations appearing in the train split
self.valid_relations = self.get_relations(self.valid_data) # relations appearing in the valid split
self.test_relations = self.get_relations(self.test_data) # relations appearing in the test split
self.relations = self.train_relations + [i for i in self.valid_relations if i not in self.train_relations] + [i for i in self.test_relations if i not in self.train_relations] # all relations in the dataset
print("load_data.py---->total number of relations: len(self.relations) = ", len(self.relations))
def load_data(self, data_dir, data_type="train", reverse=False):
file_path = "%s%s.txt" % (data_dir, data_type) # file_path = data/FB15k-237/train.txt
print("data_dir = {0}; data_type = {1}; file_path = {2}".format(data_dir, data_type, file_path))
with open(file_path, "r") as f:
data = f.read().strip().split("\n")
data = [i.split('\t') for i in data]
# Add a reversed copy (tail, relation_reverse, head) of every triple, doubling the training data
if reverse:
data += [[i[2], i[1]+"_reverse", i[0]] for i in data]
return data
# Collect all entities appearing in the triples of data
def get_entities(self, data):
entities = sorted(list(set([d[0] for d in data]+[d[2] for d in data])))
return entities
# Collect all relations appearing in the triples of data
def get_relations(self, data):
relations = sorted(list(set([d[1] for d in data])))
return relations
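A quick illustration (values taken from the sample triples above) of what reverse=True does in Data.load_data: every triple also contributes a reversed triple whose relation gets a _reverse suffix, which is why train_and_eval later sees twice as many training triples.

data = [["The Prowler", "starred_actors", "Evelyn Keyes"]]
data += [[t, r + "_reverse", h] for h, r, t in data]
print(data)
# [['The Prowler', 'starred_actors', 'Evelyn Keyes'],
#  ['Evelyn Keyes', 'starred_actors_reverse', 'The Prowler']]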
trainer.py
import numpy as np
import torch
import time
from collections import defaultdict
from model import *
from torch.optim.lr_scheduler import ExponentialLR
from tqdm import tqdm
import os
class Trainer:
def __init__(self, d=None, learning_rate=0.0005, ent_vec_dim=200, rel_vec_dim=200, num_iterations=500, batch_size=128, decay_rate=0., cuda=False,
input_dropout=0.3, hidden_dropout1=0.4, hidden_dropout2=0.5, label_smoothing=0., outfile='tucker.model', valid_steps=1,
loss_type='BCE', do_batch_norm=1, dataset_name='', model_name='ComplEx', l3_reg=0.0, load_from=''):
self.d = d # full dataset (train / valid / test splits)
self.dataset_name = dataset_name # dataset name
self.learning_rate = learning_rate
self.ent_vec_dim = ent_vec_dim # entity embedding dimension
self.rel_vec_dim = rel_vec_dim # relation embedding dimension
self.num_epochs = num_iterations
self.batch_size = batch_size
self.decay_rate = decay_rate
self.label_smoothing = label_smoothing # label smoothing
self.cuda = cuda
self.outfile = outfile
self.valid_steps = valid_steps
self.model_name = model_name
self.l3_reg = l3_reg
self.loss_type = loss_type
self.load_from = load_from
if do_batch_norm == 1:
do_batch_norm = True
else:
do_batch_norm = False
self.kwargs = {"input_dropout": input_dropout, "hidden_dropout1": hidden_dropout1, "hidden_dropout2": hidden_dropout2, "model_name": model_name, "loss_type": loss_type, "do_batch_norm": do_batch_norm, "l3_reg": l3_reg}
# Convert triples to id form: ['/m/027rn', '/location/country/fo...government', '/m/06cx9'] ----> (3818, 244, 8942)
def get_data_idxs(self, data):
data_idxs = [(self.entity2idxs[data[i][0]], self.relation2idxs[data[i][1]], self.entity2idxs[data[i][2]]) for i in range(len(data))]
return data_idxs
# For a given (head entity, relation), collect all tail entities: (head, relation): [tail01, tail02...]
def get_er_vocab(self, data):
er_vocab = defaultdict(list)
for triple in data:
er_vocab[(triple[0], triple[1])].append(triple[2])
return er_vocab
# Fetch one batch of data
def get_batch(self, er_vocab, er_vocab_pairs, batch_idx):
batch = er_vocab_pairs[batch_idx:batch_idx + self.batch_size] # batch_size = 128
batch_size = len(batch)
num_entities = len(self.d.entities)
targets = torch.zeros([batch_size, num_entities], dtype=torch.float32) # targets.shape = torch.Size([128, 14541])
# print("\ntrain_embeddings---->trainer.py---->get_batch---->targets.shape = ", targets.shape)
if self.cuda:
targets = targets.cuda()
for idx, pair in enumerate(batch):
target_entities_idx = er_vocab[pair]
targets[idx, target_entities_idx] = 1.
return np.array(batch), targets
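# A quick illustration (hypothetical ids) of the multi-hot targets built by get_batch:
# with num_entities = 5 and er_vocab[(h, r)] == [1, 3], the target row for that (head, relation)
# pair is [0., 1., 0., 1., 0.], and this row is what the BCE / CE loss is trained against.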
def train_and_eval(self):
print("\ntrain_embeddings---->trainer.py---->train_and_eval:")
torch.set_num_threads(2)
best_valid = [0, 0, 0, 0, 0]
best_test = [0, 0, 0, 0, 0]
num_entities = len(self.d.entities)
num_relations = len(self.d.relations)
self.entity2idxs = {self.d.entities[i]: i for i in range(num_entities)} # entity-to-id mapping
self.relation2idxs = {self.d.relations[i]: i for i in range(num_relations)} # relation-to-id mapping
# Save the entity-to-id mapping
with open('data/' + self.dataset_name + '/entities.dict', 'w') as f:
for key, value in self.entity2idxs.items():
f.write(key + '\t' + str(value) + '\n')
# Save the relation-to-id mapping
with open('data/' + self.dataset_name + '/relations.dict', 'w') as f:
for key, value in self.relation2idxs.items():
f.write(key + '\t' + str(value) + '\n')
print("trainer.py---->train_and_eval---->len(self.d.train_data) = ", len(self.d.train_data)) # 544230 (the 272115 triples in train.txt plus their reversed copies)
# Convert triples to id form: ['/m/027rn', '/location/country/fo...government', '/m/06cx9'] ----> (3818, 244, 8942)
train_data_idxs = self.get_data_idxs(self.d.train_data) # train_data_idxs = [(3818, 244, 8942), (819, 460, 9234), (9791, 280, 756), (2522, 24, 7022),...]
print("Number of training data points: %d" % len(train_data_idxs)) # 544230 (number of triples)
print('Entities: %d' % len(self.entity2idxs)) # 14541
print('Relations: %d' % len(self.relation2idxs)) # 474
# Initialise the model (d: the data loaded by load_data.py)
model = EmbedModel(self.d, self.ent_vec_dim, self.rel_vec_dim, **self.kwargs)
model.init() # initialise the embedding weights
# Load previously saved model parameters, if any
if self.load_from != '':
fname = self.load_from
checkpoint = torch.load(fname)
model.load_state_dict(checkpoint)
# Move the model to GPU
if self.cuda:
model.cuda()
# Initialise the optimizer
opt = torch.optim.Adam(model.parameters(), lr=self.learning_rate)
# Learning-rate decay schedule
if self.decay_rate:
scheduler = ExponentialLR(opt, self.decay_rate)
er_vocab = self.get_er_vocab(train_data_idxs) # all tail entities for each (head, relation): (head, relation): [tail01, tail02...]; 149689 pairs in total
er_vocab_pairs = list(er_vocab.keys()) # all (head, relation) pairs
print("Starting training...")
# Training epochs
for epoch_idx in range(1, self.num_epochs + 1):
start_train = time.time()
model.train()
losses = []
np.random.shuffle(er_vocab_pairs)
# Iterate over batches
for j in tqdm(range(0, len(er_vocab_pairs), self.batch_size)):
# print("trainer.py---->train_and_eval---->j = ", j)
# Fetch one batch of training data
X, labels = self.get_batch(er_vocab, er_vocab_pairs, j)
# Zero the gradients
opt.zero_grad()
# Head entities and relations of the batch
e1_idx, r_idx = torch.tensor(X[:, 0]), torch.tensor(X[:, 1]) # head-entity ids, relation ids
if self.cuda:
e1_idx = e1_idx.cuda()
r_idx = r_idx.cuda()
# Predict tail entities from (head entity, relation)
predictions = model.forward(e1_idx, r_idx)
if self.label_smoothing:
labels = ((1.0 - self.label_smoothing) * labels) + (1.0 / labels.size(1))
# Compute the loss
loss = model.loss(predictions, labels) # predictions.shape = torch.Size([8, 14541]) labels.shape = torch.Size([8, 14541])
# Backpropagate
loss.backward()
# Update the parameters
opt.step()
losses.append(loss.item())
# Adjust the learning rate once per epoch
if self.decay_rate:
scheduler.step()
# Print every 100 epochs
if epoch_idx % 100 == 0:
print('Epoch', epoch_idx, ' Epoch time', time.time() - start_train, ' Loss:', np.mean(losses))
# Run validation and test every valid_steps epochs
model.eval()
with torch.no_grad():
if epoch_idx % self.valid_steps == 0:
start_test = time.time()
print("\n\nStarting evaluation on the valid split:")
valid = self.evaluate(model, self.d.valid_data)
print("\n\nStarting evaluation on the test split:")
test = self.evaluate(model, self.d.test_data)
valid_mrr = valid[0]
test_mrr = test[0]
if valid_mrr >= best_valid[0]:
best_valid = valid
best_test = test
print('Validation MRR increased.')
print('Saving model...')
self.write_embedding_files(model)
print('Model saved!')
print('Best valid:', best_valid)
print('Best Test:', best_test)
print('Dataset:', self.dataset_name)
print('Model:', self.model_name)
print(time.time() - start_test)
print('Learning rate %f | Decay %f | Dim %d | Input drop %f | Hidden drop 2 %f | LS %f | Batch size %d | Loss type %s | L3 reg %f' % (self.learning_rate,
self.decay_rate,
self.ent_vec_dim,
self.kwargs["input_dropout"],
self.kwargs["hidden_dropout2"],
self.label_smoothing,
self.batch_size,
self.loss_type,
self.l3_reg))
# Evaluate on a validation / test split
def evaluate(self, model, data):
print("train_embeddings---->trainer.py---->evaluate:")
model.eval()
hits = [[] for _ in range(10)]
ranks = []
# Convert all valid/test triples to id form: ['/m/027rn', '/location/country/fo...government', '/m/06cx9'] ----> (3818, 244, 8942)
test_data_idxs = self.get_data_idxs(data) # 35070
# All tail entities for each (head entity, relation): (head, relation): [tail01, tail02...]; e.g. (5304, 8): [7793, 8554, 1084]
er_vocab = self.get_er_vocab(test_data_idxs)
print("Number of data points: %d" % len(test_data_idxs)) # 35070
# Iterate in steps of batch_size
len_test_data_idxs = len(test_data_idxs) # 35070
for i in tqdm(range(0, len_test_data_idxs, self.batch_size)):
data_batch = np.array(test_data_idxs[i: i + self.batch_size]) # all triples of the current batch, data_batch.shape = (batch_size, 3)
print("trainer.py---->evaluate---->data_batch.shape = {0}; data_batch = \n{1}".format(data_batch.shape, data_batch))
e1_idx = torch.tensor(data_batch[:, 0]) # head entities, e.g. tensor([ 9738, 9271, 2570, 5304, 9589, 12527, 8687, 2560])
r_idx = torch.tensor(data_batch[:, 1]) # relations, e.g. tensor([170, 262, 392, 8, 192, 190, 456, 46])
e2_idx = torch.tensor(data_batch[:, 2]) # tail entities, e.g. tensor([4553, 4280, 7855, 7793, 7413, 3942, 4366, 8369])
if self.cuda:
e1_idx = e1_idx.cuda()
r_idx = r_idx.cuda()
e2_idx = e2_idx.cuda()
# Feed (head entity, relation) into the model to score every candidate tail entity
predictions = model.forward(e1_idx, r_idx) # predictions.shape = torch.Size([batch_size, 14541])
print("trainer.py---->evaluate---->predictions.shape = ", predictions.shape)
# Filtered evaluation: commenting out the filtering lines below would give RAW (unfiltered) evaluation
batch_size = data_batch.shape[0]
for i in range(batch_size):
e1_r_idx = (data_batch[i][0], data_batch[i][1]) # e.g. (5304, 8)
filt = er_vocab[e1_r_idx] # indices of all known true tails for this (head, relation), e.g. (5304, 8): [7793, 8554, 1084]
e2_idx_i = e2_idx[i] # index of the true tail of the current triple, e.g. 7793
target_value = predictions[i, e2_idx_i].item() # predicted probability of the true tail, e.g. 0.17887896299362183
# Zero out the scores of all other known true tails, keeping only the current triple's tail score
predictions[i, filt] = 0.0
predictions[i, e2_idx_i] = target_value
# Sort to get tail-entity indices by descending score
sort_values, sort_idxs = torch.sort(predictions, dim=1, descending=True) # dim=1: scores over all 14541 candidate tail entities
sort_idxs = sort_idxs.cpu().numpy() # array([[ 9791, 8454, 4553, ..., 2466, 2058, 8743],
for i in range(batch_size):
sort_idxs_i = sort_idxs[i] # predicted indices of the current sample, sorted by descending score
e2_idx_i = e2_idx[i].item() # index of the true tail of the current sample, e.g. 4553
filt_tuple = np.where(sort_idxs_i == e2_idx_i) # np.where with a single argument returns the coordinates of matching elements as a tuple, e.g. (array([2]),)
rank = filt_tuple[0][0]
ranks.append(rank + 1)
# Accumulate hits@k
for hits_level in range(10):
if rank <= hits_level:
hits[hits_level].append(1.0)
else:
hits[hits_level].append(0.0)
print("trainer.py---->evaluate---->len(hits) = ", len(hits)) # 10 lists, one per hits@k level, each with one entry per evaluated triple
# Compute hits@1, hits@3, hits@10, mean rank and MRR
hitat10 = np.mean(hits[9]) # 0.24103222127174223
hitat3 = np.mean(hits[2]) # 0.15260906757912746
hitat1 = np.mean(hits[0]) # 0.09994297120045623
meanrank = np.mean(ranks) # 2097.5095238095237
mrr = np.mean(1. / np.array(ranks)) # 0.14575755271227386
print('Hits @10: {0}'.format(hitat10))
print('Hits @3: {0}'.format(hitat3))
print('Hits @1: {0}'.format(hitat1))
print('Mean rank: {0}'.format(meanrank))
print('Mean reciprocal rank: {0}'.format(mrr))
return [mrr, meanrank, hitat10, hitat3, hitat1]
# Save the entity and relation embeddings
def write_embedding_files(self, model):
print("\ntrain_embeddings---->trainer.py---->write_embedding_files:")
model.eval()
model_folder = "kg_embeddings/%s/" % self.dataset_name # 'kg_embeddings/FB15k-237/'
data_folder = "data/%s/" % self.dataset_name # 'data/FB15k-237/'
print("\ntrain_embeddings---->trainer.py---->write_embedding_files---->model_folder = ", model_folder)
print("\ntrain_embeddings---->trainer.py---->write_embedding_files---->data_folder = ", data_folder)
embedding_type = self.model_name
if not os.path.exists(model_folder):
print("Creating directory: ", model_folder)
os.makedirs(model_folder)
E_numpy = model.E_Embedding.weight.data.cpu().numpy()
R_numpy = model.R_Embedding.weight.data.cpu().numpy()
print("train_embeddings---->trainer.py---->write_embedding_files----E_numpy.shape = ", E_numpy.shape)
print("train_embeddings---->trainer.py---->write_embedding_files----R_numpy.shape = ", R_numpy.shape)
bn_list = []
for bn in [model.bn0, model.bn1, model.bn2]:
bn_weight = bn.weight.data.cpu().numpy()
bn_bias = bn.bias.data.cpu().numpy()
bn_running_mean = bn.running_mean.data.cpu().numpy()
bn_running_var = bn.running_var.data.cpu().numpy()
bn_numpy = {}
bn_numpy['weight'] = bn_weight
bn_numpy['bias'] = bn_bias
bn_numpy['running_mean'] = bn_running_mean
bn_numpy['running_var'] = bn_running_var
bn_list.append(bn_numpy)
np.save(model_folder + '/E.npy', E_numpy) # save the entity embeddings
np.save(model_folder + '/R.npy', R_numpy) # save the relation embeddings
# Save the BatchNorm parameters
for i, bn in enumerate(bn_list):
np.save(model_folder + '/bn' + str(i) + '.npy', bn)
if embedding_type == 'TuckER':
W_numpy = model.W.detach().cpu().numpy() # only applies when the model defines a TuckER core tensor W; the ComplEx model above does not
np.save(model_folder + '/W.npy', W_numpy) # save the core tensor
# ------------------------------------------------------------ Copy the dict files ------------------------------------------------------------
# Copy entities.dict from the dataset folder into the embedding folder (rewritten as id\tname)
f1 = open(data_folder + '/entities.dict', 'r')
f2 = open(model_folder + '/entities.dict', 'w')
ents = {}
idx2ent = {}
for line in f1:
line = line.rstrip().split('\t')
name = line[0]
id = int(line[1])
ents[name] = id
idx2ent[id] = name
f2.write(str(id) + '\t' + name + '\n')
f1.close()
f2.close()
# Copy relations.dict from the dataset folder into the embedding folder (rewritten as id\tname)
f1 = open(data_folder + '/relations.dict', 'r')
f2 = open(model_folder + '/relations.dict', 'w')
rels = {}
idx2rel = {}
for line in f1:
line = line.strip().split('\t')
name = line[0]
id = int(line[1])
rels[name] = id
idx2rel[id] = name
f2.write(str(id) + '\t' + name + '\n')
f1.close()
f2.close()
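A toy walk-through (made-up scores, not from the real model) of the filtered ranking used in evaluate(): the other known true tails are zeroed out before sorting, then rank, reciprocal rank and hits@k follow.

import numpy as np

# Scores over 5 candidate tails for one (head, relation) query; made-up numbers.
predictions = np.array([0.05, 0.60, 0.10, 0.60, 0.20])
true_tail = 2               # tail of the triple being evaluated
other_true_tails = [1, 3]   # other correct tails for the same (head, relation)

# Filtered setting: remove the competing true tails, keep the evaluated tail's own score.
target = predictions[true_tail]
predictions[other_true_tails] = 0.0
predictions[true_tail] = target

sort_idxs = np.argsort(-predictions)                     # candidate indices by descending score
rank = int(np.where(sort_idxs == true_tail)[0][0]) + 1   # 1-based rank of the true tail
print("rank:", rank)                                     # 2
print("reciprocal rank:", 1.0 / rank)                    # 0.5
print("hits@1:", int(rank <= 1), "hits@3:", int(rank <= 3))  # 0 1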
utils.py
import os
import torch
import numpy as np
import random
def seed_everything(seed=1029):
'''
Seed the whole environment (Python, NumPy, PyTorch, CUDA) for reproducibility.
:param seed: the random seed
:return:
'''
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# some cudnn methods can be random even after fixing the seed unless you tell it to be deterministic
torch.backends.cudnn.deterministic = True
main.py
import os
from load_data import Data
import torch
from model import *
from trainer import Trainer
import argparse
from utils import seed_everything
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--dataset_name", type=str, default="MetaQA", nargs="?", help="Which dataset to use: FB15k, FB15k-237, MetaQA, WN18 or WN18RR.")
parser.add_argument("--num_iterations", type=int, default=500, nargs="?", help="Number of iterations.")
parser.add_argument("--batch_size", type=int, default=5120, nargs="?", help="Batch size.") # 128
parser.add_argument("--lr", type=float, default=0.0005, nargs="?", help="Learning rate.")
parser.add_argument("--model", type=str, default='ComplEx', nargs="?", help="Model.") # Rotat3
parser.add_argument("--dr", type=float, default=1.0, nargs="?", help="Decay rate.")
parser.add_argument("--edim", type=int, default=200, nargs="?", help="Entity embedding dimensionality.")
parser.add_argument("--rdim", type=int, default=200, nargs="?", help="Relation embedding dimensionality.")
parser.add_argument("--cuda", type=bool, default=True, nargs="?", help="Whether to use cuda (GPU) or not (CPU).") # note: argparse's type=bool treats any non-empty string as True
parser.add_argument("--input_dropout", type=float, default=0.3, nargs="?", help="Input layer dropout.")
parser.add_argument("--hidden_dropout1", type=float, default=0.4, nargs="?", help="Dropout after the first hidden layer.")
parser.add_argument("--hidden_dropout2", type=float, default=0.5, nargs="?", help="Dropout after the second hidden layer.")
parser.add_argument("--label_smoothing", type=float, default=0.1, nargs="?", help="Amount of label smoothing.")
parser.add_argument("--outfile", type=str, default='tucker.model', nargs="?", help="File to save")
parser.add_argument("--valid_steps", type=int, default=1, nargs="?", help="Epochs before u validate")
parser.add_argument("--loss_type", type=str, default='BCE', nargs="?", help="Loss type")
parser.add_argument("--do_batch_norm", type=int, default=1, nargs="?", help="Do batch norm or not (0, 1)")
parser.add_argument("--l3_reg", type=float, default=0.0, nargs="?", help="l3 reg hyperparameter")
parser.add_argument("--load_from", type=str, default='', nargs="?", help="load from state dict")
args = parser.parse_args()
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
dataset_name = args.dataset_name
data_dir = "data/%s/" % dataset_name # data_dir = data/MetaQA/
print("\ntrain_embeddings---->main.py---->data_dir = ", data_dir)
# Set a global random seed
seed_everything()
# Load and build the dataset (reverse=True also adds reversed triples)
d = Data(data_dir=data_dir, reverse=True)
trainer = Trainer(d=d,
num_iterations=args.num_iterations,
batch_size=args.batch_size,
learning_rate=args.lr,
decay_rate=args.dr,
ent_vec_dim=args.edim,
rel_vec_dim=args.rdim,
cuda=args.cuda,
input_dropout=args.input_dropout,
hidden_dropout1=args.hidden_dropout1,
hidden_dropout2=args.hidden_dropout2,
label_smoothing=args.label_smoothing,
outfile=args.outfile,
valid_steps=args.valid_steps,
loss_type=args.loss_type,
do_batch_norm=args.do_batch_norm,
dataset_name=args.dataset_name,
model_name=args.model,
l3_reg=args.l3_reg,
load_from=args.load_from)
trainer.train_and_eval()
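After training, write_embedding_files() leaves E.npy, R.npy, the bn*.npy files and the two .dict files under kg_embeddings/<dataset_name>/. A minimal sketch of loading them back (file names taken from that method; the looked-up entity name is just an example from the sample triples):

import numpy as np

model_folder = "kg_embeddings/MetaQA/"

E = np.load(model_folder + "E.npy")  # entity embeddings, shape (num_entities, 2 * ent_vec_dim)
R = np.load(model_folder + "R.npy")  # relation embeddings

# entities.dict written by write_embedding_files has one "id<TAB>name" entry per line.
idx2ent = {}
with open(model_folder + "entities.dict") as f:
    for line in f:
        idx, name = line.rstrip("\n").split("\t")
        idx2ent[int(idx)] = name
ent2idx = {name: idx for idx, name in idx2ent.items()}

vec = E[ent2idx["The Prowler"]]  # example lookup of one entity's embedding
print(E.shape, R.shape, vec[:5])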