Knowledge Graph - KGE - Bilinear Models - 2016: ComplEx

ComplEx is an improvement on DistMult.
Premise: it is a bilinear model; the bilinear scoring function is f(h, t) = h^T · M_r · t, with a relation-specific matrix M_r (DistMult restricts M_r to a diagonal matrix diag(r)).
Basic assumption: h, r, and t are represented as complex vectors, with components of the form u = a + bi.
Scoring function: accordingly, f(h, r, t) = Re(h^T · diag(r) · ~t), where ~t denotes the complex conjugate of t.

Antisymmetry: modeled by training h^T · diag(r) · ~t to score high while (~h)^T · diag(r) · t scores low.
Symmetry: modeled by setting the imaginary part of the relation embedding to 0.
Transitivity: not satisfied, which follows from a simple property of the dot product.
Transitivity would be modeled as: given a·b and b·c, can a·c be derived? Write a = (x1, y1), b = (x2, y2), c = (x3, y3) and pick arbitrary values, giving for example:
x1x2 + y1y2 = 1
x2x3 + y2y3 = 2
These two equations do not determine x1x3 + y1y3.
One-to-many relations: it suffices that t1 and t2 have the same projection onto h, so both tails receive the same score.
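The symmetry and antisymmetry claims are easy to verify numerically with the scoring function Re(h^T · diag(r) · ~t); a minimal sketch, not part of the original post:

import numpy as np

# ComplEx score: f(h, r, t) = Re(sum_i h_i * r_i * conj(t_i))
def complex_score(h, r, t):
    return np.sum(h * r * np.conj(t)).real

rng = np.random.default_rng(0)
h = rng.normal(size=4) + 1j * rng.normal(size=4)
t = rng.normal(size=4) + 1j * rng.normal(size=4)

# A relation with a nonzero imaginary part can be antisymmetric:
# f(h, r, t) != f(t, r, h) in general.
r_anti = rng.normal(size=4) + 1j * rng.normal(size=4)
print(complex_score(h, r_anti, t), complex_score(t, r_anti, h))

# A relation whose imaginary part is 0 is always symmetric:
# f(h, r, t) == f(t, r, h).
r_sym = rng.normal(size=4) + 0j
print(np.isclose(complex_score(h, r_sym, t), complex_score(t, r_sym, h)))  # True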
 

1. The MetaQA Dataset

A sample of the knowledge base triples (tab-separated):

The Prowler	starred_actors	Evelyn Keyes
Robinson Crusoe	in_language	English
Memory	starred_actors	Billy Zane
The Wrath of God	starred_actors	Robert Mitchum
The Departed	has_tags	police
Kismet	in_language	English
Hoodwinked!	has_tags	children
Village of the Damned	has_tags	remake
Reel Injun	written_by	Neil Diamond
The Parent Trap	starred_actors	Dennis Quaid
Tarzan	has_tags	disney animated feature
Yellow Sky	has_genre	Western
The Happening	release_year	2008
April Fool's Day	has_genre	Horror
Vantage Point	has_tags	assassination
A Patch of Blue	starred_actors	Elizabeth Hartman
Brian's Song	starred_actors	Jack Warden
...
...
...
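Each of these lines is one (head entity, relation, tail entity) triple and can be parsed by splitting on the tab character, exactly as load_data.py below does:

line = "The Prowler\tstarred_actors\tEvelyn Keyes"
head, relation, tail = line.split('\t')
print((head, relation, tail))  # ('The Prowler', 'starred_actors', 'Evelyn Keyes')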

2. Code

model.py

import numpy as np
import torch
from torch.nn.init import xavier_normal_
import torch.nn as nn
import torch.nn.functional as F


# Turns the entities and relations of the knowledge graph into embeddings
class EmbedModel(torch.nn.Module):
    def __init__(self, d, ent_vec_dim, rel_vec_dim, **kwargs):
        super(EmbedModel, self).__init__()

        self.model_name = kwargs["model_name"]
        multiplier = 2  # each embedding holds a real and an imaginary half
        self.loss_type = kwargs['loss_type']

        if self.loss_type == 'BCE':
            self.loss = self.bce_loss   # self.loss = torch.nn.BCELoss()
            self.bce_loss_loss = torch.nn.BCELoss()
        elif self.loss_type == 'CE':
            self.loss = self.ce_loss
        else:
            print('Incorrect loss specified:', self.loss_type)
            exit(0)

        self.model = self.ComplEx

        self.E_Embedding = torch.nn.Embedding(len(d.entities), ent_vec_dim * multiplier, padding_idx=0)
        self.R_Embedding = torch.nn.Embedding(len(d.relations), rel_vec_dim * multiplier, padding_idx=0)

        self.entity_dim = ent_vec_dim * multiplier
        self.do_batch_norm = kwargs["do_batch_norm"]

        self.input_dropout = torch.nn.Dropout(kwargs["input_dropout"])
        self.hidden_dropout1 = torch.nn.Dropout(kwargs["hidden_dropout1"])
        self.hidden_dropout2 = torch.nn.Dropout(kwargs["hidden_dropout2"])
        self.l3_reg = kwargs["l3_reg"]

        # BatchNorm1d's num_features:
        #   for input of shape (N, C, L), num_features is C (N: batch size, C: channels, L: length per channel);
        #   for input of shape (N, L), num_features is L (each sample then has a single implicit channel).
        self.bn0 = torch.nn.BatchNorm1d(num_features=multiplier)
        self.bn1 = torch.nn.BatchNorm1d(num_features=multiplier)   # unused by the ComplEx path, but still saved in write_embedding_files
        self.bn2 = torch.nn.BatchNorm1d(num_features=multiplier)

        self.logsoftmax = torch.nn.LogSoftmax(dim=-1)

    def init(self):
        xavier_normal_(self.E_Embedding.weight.data)
        xavier_normal_(self.R_Embedding.weight.data)

    def freeze_entity_embeddings(self):
        self.E_Embedding.weight.requires_grad = False

    def ce_loss(self, pred, true):
        pred = F.log_softmax(pred, dim=-1)
        true = true / true.size(-1)
        loss = -torch.sum(pred * true)
        return loss

    def bce_loss(self, pred, true):
        loss = self.bce_loss_loss(pred, true)
        # l3 regularization
        if self.l3_reg:
            norm = torch.norm(self.E_Embedding.weight.data, p=3, dim=-1)
            loss += self.l3_reg * torch.sum(norm)
        return loss

    def ComplEx(self, head, relation):  # head.shape = torch.Size([batch_size, 400]); relation.shape = torch.Size([batch_size, 400])
        heads_tuple = torch.chunk(head, 2, dim=1)  # heads[0].shape = torch.Size([8, 200])
        # print("model.py---->ComplEx---->heads_tuple[0].shape = {0}".format(heads_tuple[0].shape))
        head = torch.stack(list(heads_tuple), dim=1)    # torch.Size([8, 400])---->torch.Size([8, 2, 200])
        if self.do_batch_norm:
            head = self.bn0(head)
        head = self.input_dropout(head)
        head = head.permute(1, 0, 2)    # torch.Size([8, 2, 200])---->torch.Size([2, 8, 200])
        # print("model.py---->ComplEx---->head.shape = {0}".format(head.shape))
        re_head = head[0]   # re_head.shape = torch.Size([8, 200])
        im_head = head[1]   # im_head.shape = torch.Size([8, 200])
        # print("model.py---->ComplEx---->re_head.shape = {0}; im_head.shape = {1}".format(re_head.shape, im_head.shape))

        relation = self.hidden_dropout1(relation)   # relation.shape = torch.Size([8, 400])
        # print("model.py---->ComplEx---->relation.shape = {0}".format(relation.shape))

        re_relation, im_relation = torch.chunk(relation, 2, dim=1)  # re_relation.shape = torch.Size([8, 200]); im_relation.shape = torch.Size([8, 200])
        # print("model.py---->ComplEx---->re_relation.shape = {0}; im_relation.shape = {1}".format(re_relation.shape, im_relation.shape))

        re_tail, im_tail = torch.chunk(self.E_Embedding.weight, 2, dim=1)   # re_tail.shape = torch.Size([14541, 200]); im_tail.shape = torch.Size([14541, 200])
        # print("model.py---->ComplEx---->re_tail.shape = {0}; im_tail.shape = {1}".format(re_tail.shape, im_tail.shape))

        re_score = re_head * re_relation - im_head * im_relation    # re_score.shape = torch.Size([8, 200])
        im_score = re_head * im_relation + im_head * re_relation    # im_score.shape = torch.Size([8, 200])
        # print("model.py---->ComplEx---->re_score.shape = {0}; im_score.shape = {1}".format(re_score.shape, im_score.shape))

        score = torch.stack([re_score, im_score], dim=1)    # score.shape = torch.Size([8, 2, 200])
        # print("model.py---->ComplEx---->score.shape = {0}".format(score.shape))

        if self.do_batch_norm:
            score = self.bn2(score)
        score = self.hidden_dropout2(score)
        score = score.permute(1, 0, 2)  # score.shape = torch.Size([2, 8, 200])
        # print("model.py---->ComplEx---->score.shape = {0}".format(score.shape))

        re_score = score[0]  # re_score.shape = torch.Size([8, 200])
        im_score = score[1]  # im_score.shape = torch.Size([8, 200])

        score = torch.mm(re_score, re_tail.transpose(1, 0)) + torch.mm(im_score, im_tail.transpose(1, 0))   # score.shape = torch.Size([8, 14541])
        # print("model.py---->ComplEx---->score.shape = {0}".format(score.shape))

        return score

    # e1_idx: head-entity ids for one batch, e.g. tensor([12711,  1016, 11215,  5200,  6072,  8968, 11427, 13015], device='cuda:0')
    # r_idx: relation ids for one batch, e.g. tensor([382, 384, 372, 433, 319, 100, 281, 376], device='cuda:0')

    def forward(self, e1_idx, r_idx):
        e1 = self.E_Embedding(e1_idx)
        r = self.R_Embedding(r_idx)
        # print("model.py---->forward---->e1.shape = {0}; r.shape = {1}".format(e1.shape, r.shape))   # e1.shape = torch.Size([batch_size, 400]); r.shape = torch.Size([batch_size, 400])
        ans = self.model(head=e1, relation=r)
        pred = torch.sigmoid(ans)
        return pred
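As a quick smoke test, the embedding model can be driven on its own; the tiny Vocab stand-in below is hypothetical (the constructor only reads .entities and .relations from the Data object):

import torch
from model import EmbedModel

class Vocab:  # hypothetical stand-in for load_data.Data
    entities = ['e%d' % i for i in range(100)]
    relations = ['r%d' % i for i in range(10)]

kwargs = {"model_name": "ComplEx", "loss_type": "BCE", "do_batch_norm": True,
          "input_dropout": 0.3, "hidden_dropout1": 0.4, "hidden_dropout2": 0.5, "l3_reg": 0.0}
m = EmbedModel(Vocab(), ent_vec_dim=200, rel_vec_dim=200, **kwargs)
m.init()
m.eval()  # eval mode: dropout off, BatchNorm uses running statistics
pred = m(torch.tensor([1, 2, 3]), torch.tensor([0, 1, 2]))
print(pred.shape)  # torch.Size([3, 100]): one sigmoid score per candidate tail entity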

load_data.py

class Data:
    def __init__(self, data_dir=None, reverse=False):
        self.train_data = self.load_data(data_dir, "train", reverse=reverse)  # all (head, relation, tail) triples of the train split
        self.valid_data = self.load_data(data_dir, "valid", reverse=reverse)  # all (head, relation, tail) triples of the valid split
        self.test_data = self.load_data(data_dir, "test", reverse=reverse)  # all (head, relation, tail) triples of the test split

        self.data = self.train_data + self.valid_data + self.test_data  # all triples in the dataset

        self.entities = self.get_entities(self.data)    # all entities in the dataset
        print("load_data.py---->total number of entities: len(self.entities) = ", len(self.entities))

        self.train_relations = self.get_relations(self.train_data)  # relations occurring in the train triples
        self.valid_relations = self.get_relations(self.valid_data)  # relations occurring in the valid triples
        self.test_relations = self.get_relations(self.test_data)  # relations occurring in the test triples

        self.relations = self.train_relations + [i for i in self.valid_relations if i not in self.train_relations] + [i for i in self.test_relations if i not in self.train_relations]  # all relations in the dataset
        print("load_data.py---->total number of relations: len(self.relations) = ", len(self.relations))

    def load_data(self, data_dir, data_type="train", reverse=False):
        file_path = "%s%s.txt" % (data_dir, data_type)  # file_path = data/FB15k-237/train.txt
        print("data_dir = {0}; data_type = {1}; file_path = {2}".format(data_dir, data_type, file_path))

        with open(file_path, "r") as f:
            data = f.read().strip().split("\n")
            data = [i.split('\t') for i in data]
            # add a reversed copy of every triple, doubling the training data
            if reverse:
                data += [[i[2], i[1]+"_reverse", i[0]] for i in data]
        return data

    # collect all entities appearing in the triples of data
    def get_entities(self, data):
        entities = sorted(list(set([d[0] for d in data]+[d[2] for d in data])))
        return entities

    # collect all relations appearing in the triples of data
    def get_relations(self, data):
        relations = sorted(list(set([d[1] for d in data])))
        return relations
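With reverse=True, load_data doubles every split by adding an inverse triple per line; applied to the first MetaQA sample above, that step produces:

triple = ['The Prowler', 'starred_actors', 'Evelyn Keyes']
reversed_triple = [triple[2], triple[1] + '_reverse', triple[0]]
print(reversed_triple)  # ['Evelyn Keyes', 'starred_actors_reverse', 'The Prowler']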

trainer.py

import numpy as np
import torch
import time
from collections import defaultdict
from model import *
from torch.optim.lr_scheduler import ExponentialLR
from tqdm import tqdm
import os


class Trainer:
    def __init__(self, d=None, learning_rate=0.0005, ent_vec_dim=200, rel_vec_dim=200, num_iterations=500, batch_size=128, decay_rate=0., cuda=False,
                 input_dropout=0.3, hidden_dropout1=0.4, hidden_dropout2=0.5, label_smoothing=0., outfile='tucker.model', valid_steps=1,
                 loss_type='BCE', do_batch_norm=1, dataset_name='', model_name='ComplEx', l3_reg=0.0, load_from=''):

        self.d = d  # the full dataset (train, valid, and test splits)
        self.dataset_name = dataset_name    # dataset name
        self.learning_rate = learning_rate
        self.ent_vec_dim = ent_vec_dim  # entity embedding dimensionality
        self.rel_vec_dim = rel_vec_dim  # relation embedding dimensionality
        self.num_epochs = num_iterations
        self.batch_size = batch_size
        self.decay_rate = decay_rate
        self.label_smoothing = label_smoothing  # label smoothing
        self.cuda = cuda
        self.outfile = outfile
        self.valid_steps = valid_steps
        self.model_name = model_name
        self.l3_reg = l3_reg
        self.loss_type = loss_type
        self.load_from = load_from
        if do_batch_norm == 1:
            do_batch_norm = True
        else:
            do_batch_norm = False
        self.kwargs = {"input_dropout": input_dropout, "hidden_dropout1": hidden_dropout1, "hidden_dropout2": hidden_dropout2, "model_name": model_name, "loss_type": loss_type, "do_batch_norm": do_batch_norm, "l3_reg": l3_reg}

    # Convert triples to id form: ['/m/027rn', '/location/country/fo...government', '/m/06cx9']---->(3818, 244, 8942)
    def get_data_idxs(self, data):
        data_idxs = [(self.entity2idxs[data[i][0]], self.relation2idxs[data[i][1]], self.entity2idxs[data[i][2]]) for i in range(len(data))]
        return data_idxs

    # Map each (head, relation) pair to all of its tail entities: (head, relation):[tail01, tail02...]
    def get_er_vocab(self, data):
        er_vocab = defaultdict(list)
        for triple in data:
            er_vocab[(triple[0], triple[1])].append(triple[2])
        return er_vocab

    # Fetch one batch of data
    def get_batch(self, er_vocab, er_vocab_pairs, batch_idx):
        batch = er_vocab_pairs[batch_idx:batch_idx + self.batch_size]   # batch_size = 128
        batch_size = len(batch)
        num_entities = len(self.d.entities)
        targets = torch.zeros([batch_size, num_entities], dtype=torch.float32)  # targets.shape =  torch.Size([128, 14541])
        # print("\ntrain_embeddings---->trainer.py---->get_batch---->targets.shape = ", targets.shape)
        if self.cuda:
            targets = targets.cuda()
        # multi-label targets: 1.0 at every known tail entity of each (head, relation) pair
        for row_idx, pair in enumerate(batch):
            target_entities_idx = er_vocab[pair]
            targets[row_idx, target_entities_idx] = 1.
        return np.array(batch), targets

    def train_and_eval(self):
        print("\ntrain_embeddings---->trainer.py---->train_and_eval:")
        torch.set_num_threads(2)
        best_valid = [0, 0, 0, 0, 0]
        best_test = [0, 0, 0, 0, 0]
        num_entities = len(self.d.entities)
        num_relations = len(self.d.relations)

        self.entity2idxs = {self.d.entities[i]: i for i in range(num_entities)}  # entity-to-id map
        self.relation2idxs = {self.d.relations[i]: i for i in range(num_relations)}  # relation-to-id map

        # save the entity-to-id dictionary
        with open('data/' + self.dataset_name + '/entities.dict', 'w') as f:
            for key, value in self.entity2idxs.items():
                f.write(key + '\t' + str(value) + '\n')

        # save the relation-to-id dictionary
        with open('data/' + self.dataset_name + '/relations.dict', 'w') as f:
            for key, value in self.relation2idxs.items():
                f.write(key + '\t' + str(value) + '\n')

        print("trainer.py---->train_and_eval---->len(self.d.train_data) = ", len(self.d.train_data))    # 544230 (the 272115 triples of train.txt have been reversed, doubling the training data)

        # convert the triples to id form: ['/m/027rn', '/location/country/fo...government', '/m/06cx9']---->(3818, 244, 8942)
        train_data_idxs = self.get_data_idxs(self.d.train_data)  # train_data_idxs = [(3818, 244, 8942), (819, 460, 9234), (9791, 280, 756), (2522, 24, 7022),...]

        print("Number of training data points: %d" % len(train_data_idxs))  # 544230(三元组的数量)
        print('Entities: %d' % len(self.entity2idxs))   # 14541
        print('Relations: %d' % len(self.relation2idxs))    # 474

        # initialize the model (d: the data loaded via load_data.py)
        model = EmbedModel(self.d, self.ent_vec_dim, self.rel_vec_dim, **self.kwargs)
        model.init()    # Xavier-initialize the embeddings

        # load previously saved model parameters, if any
        if self.load_from != '':
            fname = self.load_from
            checkpoint = torch.load(fname)
            model.load_state_dict(checkpoint)

        # move the model to the GPU
        if self.cuda:
            model.cuda()

        # initialize the optimizer
        opt = torch.optim.Adam(model.parameters(), lr=self.learning_rate)

        # optional learning-rate decay
        if self.decay_rate:
            scheduler = ExponentialLR(opt, self.decay_rate)

        er_vocab = self.get_er_vocab(train_data_idxs)   # all tails per (head, relation) pair: (head, relation):[tail01, tail02...]; 149689 pairs in total
        er_vocab_pairs = list(er_vocab.keys())  # all (head, relation) pairs

        print("Starting training...")

        # epoch loop
        for epoch_idx in range(1, self.num_epochs + 1):
            start_train = time.time()
            model.train()
            losses = []
            np.random.shuffle(er_vocab_pairs)
            # batch loop
            for j in tqdm(range(0, len(er_vocab_pairs), self.batch_size)):
                # print("trainer.py---->train_and_eval---->j = ", j)
                # fetch one batch of training data
                X, labels = self.get_batch(er_vocab, er_vocab_pairs, j)
                # zero the gradients
                opt.zero_grad()
                # pick out the head entities and relations
                e1_idx, r_idx = torch.tensor(X[:, 0]), torch.tensor(X[:, 1])  # head-entity ids, relation ids

                if self.cuda:
                    e1_idx = e1_idx.cuda()
                    r_idx = r_idx.cuda()

                # predict the tail entities from (head, relation) with the model
                predictions = model.forward(e1_idx, r_idx)

                if self.label_smoothing:
                    labels = ((1.0 - self.label_smoothing) * labels) + (1.0 / labels.size(1))

                # compute the loss
                loss = model.loss(predictions, labels)  # predictions.shape = torch.Size([8, 14541])    labels.shape = torch.Size([8, 14541])
                # backpropagate the gradients
                loss.backward()
                # update the parameters
                opt.step()
                losses.append(loss.item())

            # adjust the learning rate once per epoch
            if self.decay_rate:
                scheduler.step()
            # print every 100 epochs
            if epoch_idx % 100 == 0:
                print('Epoch', epoch_idx, ' Epoch time', time.time() - start_train, ' Loss:', np.mean(losses))

            # validate and test every valid_steps epochs
            model.eval()
            with torch.no_grad():
                if epoch_idx % self.valid_steps == 0:
                    start_test = time.time()

                    print("\n\n开始验证-Valid:")
                    valid = self.evaluate(model, self.d.valid_data)

                    print("\n\n开始测试-Test:")
                    test = self.evaluate(model, self.d.test_data)

                    valid_mrr = valid[0]
                    test_mrr = test[0]

                    if valid_mrr >= best_valid[0]:
                        best_valid = valid
                        best_test = test
                        print('Validation MRR increased.')
                        print('Saving model...')
                        self.write_embedding_files(model)
                        print('Model saved!')

                    print('Best valid:', best_valid)
                    print('Best Test:', best_test)
                    print('Dataset:', self.dataset_name)
                    print('Model:', self.model_name)
                    print(time.time() - start_test)
                    print('Learning rate %f | Decay %f | Dim %d | Input drop %f | Hidden drop 2 %f | LS %f | Batch size %d | Loss type %s | L3 reg %f' % (self.learning_rate,
                                                                                                                                                          self.decay_rate,
                                                                                                                                                          self.ent_vec_dim,
                                                                                                                                                          self.kwargs["input_dropout"],
                                                                                                                                                          self.kwargs["hidden_dropout2"],
                                                                                                                                                          self.label_smoothing,
                                                                                                                                                          self.batch_size,
                                                                                                                                                          self.loss_type,
                                                                                                                                                          self.l3_reg))

    # evaluation on the valid/test split
    def evaluate(self, model, data):
        print("train_embeddings---->trainer.py---->evaluate:")
        model.eval()
        hits = [[] for _ in range(10)]
        ranks = []
        # convert all valid/test triples to id form: ['/m/027rn', '/location/country/fo...government', '/m/06cx9']---->(3818, 244, 8942)
        test_data_idxs = self.get_data_idxs(data)   # 35070
        # all tails per (head, relation) pair: (head, relation):[tail01, tail02...];  (5304, 8):[7793, 8554, 1084]
        er_vocab = self.get_er_vocab(test_data_idxs)

        print("Number of data points: %d" % len(test_data_idxs))    # 35070

        # iterate in steps of batch_size
        len_test_data_idxs = len(test_data_idxs)    # 35070
        for i in tqdm(range(0, len_test_data_idxs, self.batch_size)):
            data_batch = np.array(test_data_idxs[i: i + self.batch_size])   # all triples of the current batch; data_batch.shape = (batch_size, 3)
            # print("trainer.py---->evaluate---->data_batch.shape = {0}; data_batch = \n{1}".format(data_batch.shape, data_batch))

            e1_idx = torch.tensor(data_batch[:, 0])  # head entities, e.g. tensor([ 9738,  9271,  2570,  5304,  9589, 12527,  8687,  2560])
            r_idx = torch.tensor(data_batch[:, 1])  # relations, e.g. tensor([170, 262, 392,   8, 192, 190, 456,  46])
            e2_idx = torch.tensor(data_batch[:, 2])  # tail entities, e.g. tensor([4553, 4280, 7855, 7793, 7413, 3942, 4366, 8369])

            if self.cuda:
                e1_idx = e1_idx.cuda()
                r_idx = r_idx.cuda()
                e2_idx = e2_idx.cuda()

            # feed (head, relation) to the model to predict the tail entities
            predictions = model.forward(e1_idx, r_idx)  # predictions.shape = torch.Size([batch_size, 14541])
            # print("trainer.py---->evaluate---->predictions.shape = ", predictions.shape)

            # commenting out the following block would give RAW (unfiltered) evaluation
            batch_size = data_batch.shape[0]
            for b in range(batch_size):
                e1_r_idx = (data_batch[b][0], data_batch[b][1])  # e.g. (5304, 8)
                filt = er_vocab[e1_r_idx]   # all known tails of this (head, relation) pair, e.g. (5304, 8):[7793, 8554, 1084]
                e2_idx_b = e2_idx[b]    # index of the true tail of this triple, e.g. 7793
                target_value = predictions[b, e2_idx_b].item()  # predicted score of the true tail, e.g. 0.17887896299362183
                # zero the scores of all known tails of this (head, relation) pair,
                # then restore only the score of the tail currently being ranked
                predictions[b, filt] = 0.0
                predictions[b, e2_idx_b] = target_value

            # sort to rank the candidate tails by descending score
            sort_values, sort_idxs = torch.sort(predictions, dim=1, descending=True)  # dim=1: scores over all 14541 candidate entities
            sort_idxs = sort_idxs.cpu().numpy()  # e.g. array([[ 9791,  8454,  4553, ...,  2466,  2058,  8743],
            for b in range(batch_size):
                sort_idxs_b = sort_idxs[b]  # candidate indices of this sample, sorted by descending score
                e2_idx_b = e2_idx[b].item()  # index of the true tail of this sample, e.g. 4553
                filt_tuple = np.where(sort_idxs_b == e2_idx_b)  # called with a single argument, np.where treats it as a condition and returns the coordinates of all matching elements as a tuple, e.g. (array([2]),)
                rank = filt_tuple[0][0]
                ranks.append(rank + 1)

                # accumulate hits@k
                for hits_level in range(10):
                    if rank <= hits_level:
                        hits[hits_level].append(1.0)
                    else:
                        hits[hits_level].append(0.0)

        print("trainer.py---->evaluate---->len(hits) = ", len(hits))    # (10, 35070)

        # compute hits@1, hits@3, hits@10, mean rank, and MRR
        hitat10 = np.mean(hits[9])  # 0.24103222127174223
        hitat3 = np.mean(hits[2])   # 0.15260906757912746
        hitat1 = np.mean(hits[0])   # 0.09994297120045623
        meanrank = np.mean(ranks)   # 2097.5095238095237
        mrr = np.mean(1. / np.array(ranks))  # 0.14575755271227386

        print('Hits @10: {0}'.format(hitat10))
        print('Hits @3: {0}'.format(hitat3))
        print('Hits @1: {0}'.format(hitat1))
        print('Mean rank: {0}'.format(meanrank))
        print('Mean reciprocal rank: {0}'.format(mrr))

        return [mrr, meanrank, hitat10, hitat3, hitat1]

    # save the entity and relation embeddings
    def write_embedding_files(self, model):
        print("\ntrain_embeddings---->trainer.py---->write_embedding_files:")
        model.eval()
        model_folder = "kg_embeddings/%s/" % self.dataset_name  # 'kg_embeddings/FB15k-237/'
        data_folder = "data/%s/" % self.dataset_name  # 'data/FB15k-237/'

        print("\ntrain_embeddings---->trainer.py---->write_embedding_files---->model_folder = ", model_folder)
        print("\ntrain_embeddings---->trainer.py---->write_embedding_files---->data_folder = ", data_folder)

        embedding_type = self.model_name
        if not os.path.exists(model_folder):
            print("Creating directory: ", model_folder)
            os.makedirs(model_folder)

        E_numpy = model.E_Embedding.weight.data.cpu().numpy()
        R_numpy = model.R_Embedding.weight.data.cpu().numpy()

        print("train_embeddings---->trainer.py---->write_embedding_files----E_numpy.shape = ", E_numpy.shape)
        print("train_embeddings---->trainer.py---->write_embedding_files----R_numpy.shape = ", R_numpy.shape)

        bn_list = []
        for bn in [model.bn0, model.bn1, model.bn2]:
            bn_weight = bn.weight.data.cpu().numpy()
            bn_bias = bn.bias.data.cpu().numpy()
            bn_running_mean = bn.running_mean.data.cpu().numpy()
            bn_running_var = bn.running_var.data.cpu().numpy()

            bn_numpy = {}
            bn_numpy['weight'] = bn_weight
            bn_numpy['bias'] = bn_bias
            bn_numpy['running_mean'] = bn_running_mean
            bn_numpy['running_var'] = bn_running_var

            bn_list.append(bn_numpy)

        np.save(model_folder + '/E.npy', E_numpy)   # save the entity embeddings
        np.save(model_folder + '/R.npy', R_numpy)   # save the relation embeddings

        # save the BatchNorm parameters
        for i, bn in enumerate(bn_list):
            np.save(model_folder + '/bn' + str(i) + '.npy', bn)

        if embedding_type == 'TuckER':
            W_numpy = model.W.detach().cpu().numpy()
            np.save(model_folder + '/W.npy', W_numpy)  # save the weights

        # ------------------------------------------------------------ copy the dict files ------------------------------------------------------------
        # copy entities.dict from the dataset folder into the model folder (written as id\tname)
        f1 = open(data_folder + '/entities.dict', 'r')
        f2 = open(model_folder + '/entities.dict', 'w')

        ents = {}
        idx2ent = {}
        for line in f1:
            line = line.rstrip().split('\t')
            name = line[0]
            id = int(line[1])
            ents[name] = id
            idx2ent[id] = name
            f2.write(str(id) + '\t' + name + '\n')
        f1.close()
        f2.close()

        # copy relations.dict from the dataset folder into the model folder (written as id\tname)
        f1 = open(data_folder + '/relations.dict', 'r')
        f2 = open(model_folder + '/relations.dict', 'w')
        rels = {}
        idx2rel = {}
        for line in f1:
            line = line.strip().split('\t')
            name = line[0]
            id = int(line[1])
            rels[name] = id
            idx2rel[id] = name
            f2.write(str(id) + '\t' + name + '\n')
        f1.close()
        f2.close()
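To make the er_vocab bookkeeping and the filtered ranking in evaluate concrete, here is a toy walk-through with made-up numbers (a sketch, not part of the trainer):

import numpy as np
import torch
from collections import defaultdict

triples = [(0, 0, 1), (0, 0, 2), (3, 1, 1)]  # (head, relation, tail) ids
er_vocab = defaultdict(list)
for h, r, t in triples:
    er_vocab[(h, r)].append(t)
# er_vocab[(0, 0)] == [1, 2]: both tails are correct answers for (0, 0)

predictions = torch.tensor([[0.1, 0.6, 0.8, 0.2]])  # scores over 4 entities for head 0, relation 0
true_tail = 1
target_value = predictions[0, true_tail].item()
predictions[0, er_vocab[(0, 0)]] = 0.0    # zero out ALL known tails...
predictions[0, true_tail] = target_value  # ...then restore the one being ranked
sort_values, sort_idxs = torch.sort(predictions, dim=1, descending=True)
rank = int(np.where(sort_idxs[0].numpy() == true_tail)[0][0]) + 1
print(rank)  # 1; without filtering, the competing true tail (entity 2) would push the rank to 2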

utils.py

import os
import torch
import numpy as np
import random


def seed_everything(seed=1029):
    '''
    Seed every source of randomness in the environment.
    :param seed:
    :return:
    '''
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    # some cudnn methods can be random even after fixing the seed unless you tell it to be deterministic
    torch.backends.cudnn.deterministic = True

main.py

import os
from load_data import Data
import torch
from model import *
from trainer import Trainer
import argparse
from utils import seed_everything


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset_name", type=str, default="MetaQA", nargs="?", help="Which dataset to use: FB15k, FB15k-237, MetaQA, WN18 or WN18RR.")
    parser.add_argument("--num_iterations", type=int, default=500, nargs="?", help="Number of iterations.")
    parser.add_argument("--batch_size", type=int, default=5120, nargs="?", help="Batch size.")   # 128
    parser.add_argument("--lr", type=float, default=0.0005, nargs="?", help="Learning rate.")
    parser.add_argument("--model", type=str, default='ComplEx', nargs="?", help="Model.")    # Rotat3
    parser.add_argument("--dr", type=float, default=1.0, nargs="?", help="Decay rate.")
    parser.add_argument("--edim", type=int, default=200, nargs="?", help="Entity embedding dimensionality.")
    parser.add_argument("--rdim", type=int, default=200, nargs="?", help="Relation embedding dimensionality.")
    parser.add_argument("--cuda", type=bool, default=True, nargs="?", help="Whether to use cuda (GPU) or not (CPU).")
    parser.add_argument("--input_dropout", type=float, default=0.3, nargs="?", help="Input layer dropout.")
    parser.add_argument("--hidden_dropout1", type=float, default=0.4, nargs="?", help="Dropout after the first hidden layer.")
    parser.add_argument("--hidden_dropout2", type=float, default=0.5, nargs="?", help="Dropout after the second hidden layer.")
    parser.add_argument("--label_smoothing", type=float, default=0.1, nargs="?", help="Amount of label smoothing.")
    parser.add_argument("--outfile", type=str, default='tucker.model', nargs="?", help="File to save")
    parser.add_argument("--valid_steps", type=int, default=1, nargs="?", help="Epochs before u validate")
    parser.add_argument("--loss_type", type=str, default='BCE', nargs="?", help="Loss type")
    parser.add_argument("--do_batch_norm", type=int, default=1, nargs="?", help="Do batch norm or not (0, 1)")
    parser.add_argument("--l3_reg", type=float, default=0.0, nargs="?", help="l3 reg hyperparameter")
    parser.add_argument("--load_from", type=str, default='', nargs="?", help="load from state dict")

    args = parser.parse_args()

    os.environ["CUDA_VISIBLE_DEVICES"] = "3"

    dataset_name = args.dataset_name
    data_dir = "data/%s/" % dataset_name    # data_dir =  data/MetaQA/
    print("\ntrain_embeddings---->main.py---->data_dir = ", data_dir)

    # set a single global random seed
    seed_everything()

    # read and build the dataset
    d = Data(data_dir=data_dir, reverse=True)

    trainer = Trainer(d=d,
                      num_iterations=args.num_iterations,
                      batch_size=args.batch_size,
                      learning_rate=args.lr,
                      decay_rate=args.dr,
                      ent_vec_dim=args.edim,
                      rel_vec_dim=args.rdim,
                      cuda=args.cuda,
                      input_dropout=args.input_dropout,
                      hidden_dropout1=args.hidden_dropout1,
                      hidden_dropout2=args.hidden_dropout2,
                      label_smoothing=args.label_smoothing,
                      outfile=args.outfile,
                      valid_steps=args.valid_steps,
                      loss_type=args.loss_type,
                      do_batch_norm=args.do_batch_norm,
                      dataset_name=args.dataset_name,
                      model_name=args.model,
                      l3_reg=args.l3_reg,
                      load_from=args.load_from)

    trainer.train_and_eval()
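Assuming data/MetaQA/ contains train.txt, valid.txt and test.txt in the tab-separated format shown above, training is launched with, e.g., python main.py --dataset_name MetaQA, and the embeddings of the best-validation model are written to kg_embeddings/MetaQA/.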
