码猿小菜鸡

Transformer架构完整代码示例

Transformer架构完整代码

#!/usr/bin/python3.9
# -*- coding: utf-8 -*-
# @Time    : 2023/6/29 10:48
# @File    : abd_transformer_cyd.py
# @Software: PyCharm

import math
import torch
import collections
import numpy as np
import torch.nn as nn
from copy import deepcopy
import torch.nn.functional as F
from torch.autograd import Variable

# 让Hypothesis拥有可访问的属性，即Hypothesis.value
Hypothesis = collections.namedtuple('Hypothesis', ['value', 'score'])

def clone_module_to_modulelist(module, module_num):
    """
    克隆n个Module类放入ModuleList中，并返回ModuleList，这个ModuleList中的每个Module都是一模一样的
    nn.ModuleList，它是一个储存不同 module，并自动将每个 module 的 parameters 添加到网络之中的容器。
    你可以把任意 nn.Module 的子类 (比如 nn.Conv2d, nn.Linear 之类的) 加到这个 list 里面，
    加入到 nn.ModuleList 里面的 module 是会自动注册到整个网络上的，
    同时 module 的 parameters 也会自动添加到整个网络中。
    :param module: 被克隆的module
    :param module_num: 被克隆的module数
    :return: 装有module_num个相同module的ModuleList
    """
    return nn.ModuleList([deepcopy(module) for _ in range(module_num)])


class LayerNorm(nn.Module):
    """
    构建一个LayerNorm Module
    LayerNorm的作用：对x归一化，使x的均值为0，方差为1
    LayerNorm计算公式：x-mean(x)/\sqrt{var(x)+\epsilon} = x-mean(x)/std(x)+\epsilon
    """

    def __init__(self, x_size, eps=1e-6):
        """
        :param x_size: 特征的维度
        :param eps: eps是一个平滑的过程，取值通常在（10^-4~10^-8 之间）
        其含义是，对于每个参数，随着其更新的总距离增多，其学习速率也随之变慢。
        防止出现除以0的情况。

        nn.Parameter将一个不可训练的类型Tensor转换成可以训练的类型parameter，
        并将这个parameter绑定到这个module里面。
        使用这个函数的目的也是想让某些变量在学习的过程中不断的修改其值以达到最优化。
        """
        super(LayerNorm, self).__init__()
        self.ones_tensor = nn.Parameter(torch.ones(x_size))  # 按照特征向量大小返回一个全1的张量，并且转换成可训练的parameter类型
        self.zeros_tensor = nn.Parameter(torch.zeros(x_size))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)  # 求标准差
        return self.ones_tensor * (x - mean) / (std + self.eps) + self.zeros_tensor  # LayerNorm的计算公式


class FeatEmbedding(nn.Module):
    """
    视频特征向量生成器
    """

    def __init__(self, d_feat, d_model, dropout):
        """
        FeatEmbedding的初始化
        :param d_feat: per frame dimension（每帧的维度），作为Linear层输入的维度
        :param d_model: 作为Linear层输出的维度
        :param dropout: Dropout层的比率

        nn.Sequential：这是一个有顺序的容器，将特定神经网络模块按照在传入构造器的顺序依次被添加到计算图中
        在这里构造的容器是：LayerNorm --> Dropout --> Linear
        """
        super(FeatEmbedding, self).__init__()
        self.video_embeddings = nn.Sequential(
            # TODO:为什么这里对视频做处理，即图片做处理，不使用BatchNorm
            # nn.BatchNorm2d(d_feat)
            # nn.LayerNorm(d_feat),
            LayerNorm(d_feat),
            nn.Dropout(p=dropout),
            nn.Linear(d_feat, d_model)
        )

    def forward(self, x):
        return self.video_embeddings(x)  # 返回被处理后的视频特征向量


class WordEmbedding(nn.Module):
    """
    把向量构造成d_model维度的词向量，以便后续送入编码器
    """

    def __init__(self, vocab_size, d_model):
        """
        :param vocab_size: 字典长度
        :param d_model: 词向量维度
        """
        super(WordEmbedding, self).__init__()
        self.d_model = d_model
        # 字典中有vocab_size个词，词向量维度是d_model，每个词将会被映射成d_model维度的向量
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.embed = self.embedding

    def forward(self, x):
        # TODO：为什么要乘以一个sqrt，Transformer中的？
        return self.embed(x) * math.sqrt(self.d_model)


class PositionalEncoding(nn.Module):
    """
    正弦位置编码，即通过三角函数构建位置编码

    Implementation based on "Attention Is All You Need"
    :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`
    """

    def __init__(self, dim: int, dropout: float, max_len=5000):
        """
        :param dim: 位置向量的向量维度，一般与词向量维度相同，即d_model
        :param dropout: Dropout层的比率
        :param max_len: 句子的最大长度
        """
        # 判断能够构建位置向量
        if dim % 2 != 0:
            raise ValueError(f"不能使用 sin/cos 位置编码，得到了奇数的维度{dim:d}，应该使用偶数维度")

        """
        构建位置编码pe
        pe公式为：
        PE(pos,2i/2i+1) = sin/cos(pos/10000^{2i/d_{model}})
        """
        pe = torch.zeros(max_len, dim)  # 初始化pe
        position = torch.arange(0, max_len).unsqueeze(1)  # 构建pos，为句子的长度，相当于pos
        div_term = torch.exp((torch.arange(0, dim, 2, dtype=torch.float) * torch.tensor(
            -(math.log(10000.0) / dim))))  # 复现位置编码sin/cos中的公式
        pe[:, 0::2] = torch.sin(position.float() * div_term)  # 偶数使用sin函数
        pe[:, 1::2] = torch.cos(position.float() * div_term)  # 奇数使用cos函数
        pe = pe.unsqueeze(1)  # 扁平化成一维向量

        super(PositionalEncoding, self).__init__()
        self.register_buffer('pe', pe)  # pe不是模型的一个参数，通过register_buffer把pe写入内存缓冲区，当做一个内存中的常量
        self.drop_out = nn.Dropout(p=dropout)
        self.dim = dim

    def forward(self, emb, step=None):
        """
        词向量和位置编码拼接并输出
        :param emb: 词向量序列（FloatTensor），``(seq_len, batch_size, self.dim)``
        :param step: 如果 stepwise("seq_len=1")，则用此位置的编码
        :return: 词向量和位置编码的拼接
        """
        emb = emb * math.sqrt(self.dim)
        if step is None:
            emb = emb + self.pe[:emb.size(0)]  # 拼接词向量和位置编码
        else:
            emb = emb + self.pe[step]
        emb = self.drop_out(emb)
        return emb


def self_attention(query, key, value, dropout=None, mask=None):
    """
    自注意力计算
    :param query: Q
    :param key: K
    :param value: V
    :param dropout: drop比率
    :param mask: 是否mask
    :return: 经自注意力机制计算后的值
    """
    d_k = query.size(-1)  # 防止softmax未来求梯度消失时的d_k
    # Q,K相似度计算公式：\frac{Q^TK}{\sqrt{d_k}}
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)  # Q,K相似度计算
    # 判断是否要mask，注：mask的操作在QK之后，softmax之前
    if mask is not None:
        """
        scores.masked_fill默认是按照传入的mask中为1的元素所在的索引，
        在scores中相同的的索引处替换为value，替换值为-1e9，即-(10^9)
        """
        # mask.cuda()
        # 进行mask操作，由于参数mask==0，因此替换上述mask中为0的元素所在的索引

    scores = scores.masked_fill(mask == 0, -1e9)

    self_attn_softmax = F.softmax(scores, dim=-1)  # 进行softmax
    # 判断是否要对相似概率分布进行dropout操作
    if dropout is not None:
        self_attn_softmax = dropout(self_attn_softmax)

    # 注意：返回经自注意力计算后的值，以及进行softmax后的相似度（即相似概率分布）
    return torch.matmul(self_attn_softmax, value), self_attn_softmax


class MultiHeadAttention(nn.Module):
    """
    多头注意力计算
    """

    def __init__(self, head, d_model, dropout=0.1):
        """
        :param head: 头数
        :param d_model: 词向量的维度，必须是head的整数倍
        :param dropout: drop比率
        """
        super(MultiHeadAttention, self).__init__()
        assert (d_model % head == 0)  # 确保词向量维度是头数的整数倍
        self.d_k = d_model // head  # 被拆分为多头后的某一头词向量的维度
        self.head = head
        self.d_model = d_model

        """
        由于多头注意力机制是针对多组Q、K、V，因此有了下面这四行代码，具体作用是，
        针对未来每一次输入的Q、K、V，都给予参数进行构建
        其中linear_out是针对多头汇总时给予的参数
        """
        self.linear_query = nn.Linear(d_model, d_model)  # 进行一个普通的全连接层变化，但不修改维度
        self.linear_key = nn.Linear(d_model, d_model)
        self.linear_value = nn.Linear(d_model, d_model)
        self.linear_out = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(p=dropout)
        self.attn_softmax = None  # attn_softmax是能量分数, 即句子中某一个词与所有词的相关性分数， softmax(QK^T)

    def forward(self, query, key, value, mask=None):
        if mask is not None:
            """
            多头注意力机制的线性变换层是4维，是把query[batch, frame_num, d_model]变成[batch, -1, head, d_k]
            再1，2维交换变成[batch, head, -1, d_k], 所以mask要在第二维（head维）添加一维，与后面的self_attention计算维度一样
            具体点将，就是：
            因为mask的作用是未来传入self_attention这个函数的时候，作为masked_fill需要mask哪些信息的依据
            针对多head的数据，Q、K、V的形状维度中，只有head是通过view计算出来的，是多余的，为了保证mask和
            view变换之后的Q、K、V的形状一直，mask就得在head这个维度添加一个维度出来，进而做到对正确信息的mask
            """
            mask = mask.unsqueeze(1)

        n_batch = query.size(0)  # batch_size大小，假设query的维度是：[10, 32, 512]，其中10是batch_size的大小

        """
        下列三行代码都在做类似的事情，对Q、K、V三个矩阵做处理
        其中view函数是对Linear层的输出做一个形状的重构，其中-1是自适应（自主计算）
        从这种重构中，可以看出，虽然增加了头数，但是数据的总维度是没有变化的，也就是说多头是对数据内部进行了一次拆分
        transopose(1,2)是对前形状的两个维度(索引从0开始)做一个交换，例如(2,3,4,5)会变成(2,4,3,5)
        因此通过transpose可以让view的第二维度参数变成n_head
        假设Linear成的输出维度是：[10, 32, 512]，其中10是batch_size的大小
        注：这里解释了为什么d_model // head == d_k，如若不是，则view函数做形状重构的时候会出现异常
        """
        query = self.linear_query(query).view(n_batch, -1, self.head, self.d_k).transpose(1, 2)  # [b, 8, 32, 64]，head=8
        key = self.linear_key(key).view(n_batch, -1, self.head, self.d_k).transpose(1, 2)  # [b, 8, 28, 64]
        value = self.linear_value(value).view(n_batch, -1, self.head, self.d_k).transpose(1, 2)  # [b, 8, 28, 64]

        # x是通过自注意力机制计算出来的值， self.attn_softmax是相似概率分布
        x, self.attn_softmax = self_attention(query, key, value, dropout=self.dropout, mask=mask)

        """
        下面的代码是汇总各个头的信息，拼接后形成一个新的x
        其中self.head * self.d_k，可以看出x的形状是按照head数拼接成了一个大矩阵，然后输入到linear_out层添加参数
        contiguous()是重新开辟一块内存后存储x，然后才可以使用.view方法，否则直接使用.view方法会报错
        """
        x = x.transpose(1, 2).contiguous().view(n_batch, -1, self.head * self.d_k)
        return self.linear_out(x)


class FeedForward(nn.Module):
    """
    两层具有残差网络的前馈神经网络，FNN网络
    """

    def __init__(self, d_model: int, d_ff: int, dropout=0.1):
        """
        :param d_model: FFN第一层输入的维度
        :param d_ff: FNN第二层隐藏层输入的维度
        :param dropout: drop比率
        """
        super(FeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout_1 = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x):
        """
        :param x: 输入数据，形状为(batch_size, input_len, model_dim)
        :return: 输出数据（FloatTensor），形状为(batch_size, input_len, model_dim)
        """
        inter = self.dropout_1(self.relu(self.w_1(self.layer_norm(x))))
        output = self.dropout_2(self.w_2(inter))
        # return output + x，即为残差网络
        return output  # + x


class SublayerConnection(nn.Module):
    """
    子层的连接: layer_norm(x + sublayer(x))
    上述可以理解为一个残差网络加上一个LayerNorm归一化
    """

    def __init__(self, size, dropout=0.1):
        """
        :param size: d_model
        :param dropout: drop比率
        """
        super(SublayerConnection, self).__init__()
        self.layer_norm = LayerNorm(size)
        # TODO：在SublayerConnection中LayerNorm可以换成nn.BatchNorm2d
        # self.layer_norm = nn.BatchNorm2d()
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, sublayer):
        return self.dropout(self.layer_norm(x + sublayer(x)))


class EncoderLayer(nn.Module):
    """
    一层编码Encoder层
    MultiHeadAttention -> Add & Norm -> Feed Forward -> Add & Norm
    """

    def __init__(self, size, attn, feed_forward, dropout=0.1):
        """
        :param size: d_model
        :param attn: 已经初始化的Multi-Head Attention层
        :param feed_forward: 已经初始化的Feed Forward层
        :param dropout: drop比率
        """
        super(EncoderLayer, self).__init__()
        self.attn = attn
        self.feed_forward = feed_forward

        """
        下面一行的作用是因为一个Encoder层具有两个残差结构的网络
        因此构建一个ModuleList存储两个SublayerConnection，以便未来对数据进行残差处理
        """
        self.sublayer_connection_list = clone_module_to_modulelist(SublayerConnection(size, dropout), 2)

    def forward(self, x, mask):
        """
        :param x: Encoder层的输入
        :param mask: mask标志
        :return: 经过一个Encoder层处理后的输出
        """
        """
        编码层第一层子层
        self.attn 应该是一个已经初始化的Multi-Head Attention层
        把Encoder的输入数据x和经过一个Multi-Head Attention处理后的x_attn送入第一个残差网络进行处理得到first_x
        """
        first_x = self.sublayer_connection_list[0](x, lambda x_attn: self.attn(x, x, x, mask))

        """
        编码层第二层子层
        把经过第一层子层处理后的数据first_x与前馈神经网络送入第二个残差网络进行处理得到Encoder层的输出
        """
        return self.sublayer_connection_list[1](first_x, self.feed_forward)


class DecoderLayer(nn.Module):
    """
    一层解码Decoder层
    Mask MultiHeadAttention -> Add & Norm -> Multi-Head Attention -> Add & Norm
    -> Feed Forward -> Add & Norm
    """

    def __init__(self, d_model, attn, feed_forward, sublayer_num, dropout=0.1):
        """
        :param d_model: d_model
        :param attn: 已经初始化的Multi-Head Attention层
        :param feed_forward: 已经初始化的Feed Forward层
        :param sublayer_num: 解码器内部子层数，如果未来r2l_memory传入有值，则为4层，否则为普通的3层
        :param dropout: drop比率
        """
        super(DecoderLayer, self).__init__()
        self.attn = attn
        self.feed_forward = feed_forward
        self.sublayer_connection_list = clone_module_to_modulelist(SublayerConnection(d_model, dropout), sublayer_num)

    def forward(self, x, l2r_memory, src_mask, trg_mask, r2l_memory=None, r2l_trg_mask=None):
        """
        :param x: Decoder的输入(captioning)
        :param l2r_memory: Encoder的输出，作为Multi-Head Attention的K，V值，为从左到右的Encoder的输出
        :param src_mask: 编码器输入的填充掩码
        :param trg_mask: 解码器输入的填充掩码和序列掩码，即对后面单词的掩码
        :param r2l_memory: 从右到左解码器的输出
        :param r2l_trg_mask: 从右到左解码器的输出的填充掩码和序列掩码
        :return: Encoder的输出
        """
        """
        解码器第一层子层
        把Decoder的输入数据x和经过一个Masked Multi-Head Attention处理后的first_x_attn送入第一个残差网络进行处理得到first_x
        """
        first_x = self.sublayer_connection_list[0](x, lambda first_x_attn: self.attn(x, x, x, trg_mask))

        """
        解码器第二层子层
        把第一层子层得到的first_x和
        经过一个Multi-Head Attention处理后的second_x_attn（由first_x和Encoder的输出进行自注意力计算）
        送入第二个残差网络进行处理
        """
        second_x = self.sublayer_connection_list[1](first_x,
                                                    lambda second_x_attn: self.attn(first_x, l2r_memory, l2r_memory,
                                                                                    src_mask))

        """
        解码器第三层子层
        把经过第二层子层处理后的数据second_x与前馈神经网络送入第三个残差网络进行处理得到Decoder层的输出
        
        如果有r2l_memory数据，则还需要经过一层多头注意力计算，也就是说会有四个残差网络
        r2l_memory是让Decoder层多了一层双向编码中从右到左的编码层
        而只要三个残差网络的Decoder层只有从左到右的编码
        """
        if not r2l_memory:
            # 进行从右到左的编码，增加语义信息
            third_x = self.sublayer_connection_list[-2](second_x,
                                                        lambda third_x_attn: self.attn(second_x, r2l_memory, r2l_memory,
                                                                                       r2l_trg_mask))
            return self.sublayer_connection_list[-1](third_x, self.feed_forward)
        else:
            return self.sublayer_connection_list[-1](second_x, self.feed_forward)


class Encoder(nn.Module):
    """
    构建n层编码层
    """

    def __init__(self, n, encoder_layer):
        """
        :param n: Encoder层的层数
        :param encoder_layer: 初始化的Encoder层
        """
        super(Encoder, self).__init__()
        self.encoder_layer_list = clone_module_to_modulelist(encoder_layer, n)

    def forward(self, x, src_mask):
        """
        :param x: 输入数据
        :param src_mask: mask标志
        :return: 经过n层Encoder处理后的数据
        """
        for encoder_layer in self.encoder_layer_list:
            x = encoder_layer(x, src_mask)
        return x


class R2LDecoder(nn.Module):
    """
    n个含有R2L自注意计算的解码层，该解码层只有3个残差网络
    """

    def __init__(self, n_layers, decoder_layer):
        """
        :param n_layers: Decoder层的层数
        :param decoder_layer: 初始化的Decoder层
        """
        super(R2LDecoder, self).__init__()
        self.decoder_layer_list = clone_module_to_modulelist(decoder_layer, n_layers)

    def forward(self, x, memory, src_mask, trg_mask):
        for decoder_layer in self.decoder_layer_list:
            # 没有传入r2l_memory和r2l_trg_mask，默认值为None，即该Decoder只有3个残差网络
            x = decoder_layer(x, memory, src_mask, trg_mask)
        return x


class L2RDecoder(nn.Module):
    """
    n个含有L2R自注意计算的解码层，该解码层有4个残差网络
    """

    def __init__(self, n_layers, decoder_layer):
        """
        :param n_layers: Decoder层的层数
        :param decoder_layer: 初始化的Decoder层
        """
        super(L2RDecoder, self).__init__()
        self.decoder_layer_list = clone_module_to_modulelist(decoder_layer, n_layers)

    def forward(self, x, memory, src_mask, trg_mask, r2l_memory, r2l_trg_mask):
        for decoder_layer in self.decoder_layer_list:
            # 传入r2l_memory和r2l_trg_mask，即修改默认值，Decoder将具有4个残差网络
            x = decoder_layer(x, memory, src_mask, trg_mask, r2l_memory, r2l_trg_mask)
        return x


def sequence_mask(size):
    """
    序列掩码，解码器输入数据时掩盖后续词的位置
    :param size: 生成词个数
    :return: 右上角为False，主对角线及左下角为True的bool矩阵
    """
    attn_shape = (1, size, size)
    """
    np.triu：返回函数的上三角矩阵A，k=1得到主对角线向上平移一个距离的对角线，
    即保留右上对角线及其以上的数据，其余置为0，即a_11=0
    """
    mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return (torch.from_numpy(mask) == 0).cuda()  # 通过==0返回的是bool矩阵，即矩阵元素为bool值


def src_trg_mask(src, r2l_trg, trg, pad_idx):
    """
    :param src: 编码器的输入
    :param r2l_trg: r2l方向解码器的输入
    :param trg: l2r方向解码器的输入
    :param pad_idx: pad的索引
    :return: trg为None，返回编码器输入的掩码；trg存在，返回编码器和解码器输入的掩码
    """

    # TODO: enc_src_mask是元组，是否可以改成list，然后修改这种冗余代码
    # 通过src的长短，即视频特征向量提取的模式，判断有多少种特征向量需要进行mask
    if isinstance(src, tuple) and len(src) == 4:
        # 不同模式的视频特征向量的mask
        src_image_mask = (src[0][:, :, 0] != pad_idx).unsqueeze(1)  # 二维特征向量
        src_motion_mask = (src[1][:, :, 0] != pad_idx).unsqueeze(1)  # 三维特征向量
        src_object_mask = (src[2][:, :, 0] != pad_idx).unsqueeze(1)  # 目标检测特征向量
        src_rel_mask = (src[3][:, :, 0] != pad_idx).unsqueeze(1)  # 目标关系特征向量

        # 视频所有特征向量mask的拼接
        enc_src_mask = (src_image_mask, src_motion_mask, src_object_mask, src_rel_mask)
        dec_src_mask = src_image_mask & src_motion_mask  # 视频二维和三维特征向量mask的拼接
        src_mask = (enc_src_mask, dec_src_mask)  # 视频最终的mask
    elif isinstance(src, tuple) and len(src) == 3:
        src_image_mask = (src[0][:, :, 0] != pad_idx).unsqueeze(1)
        src_motion_mask = (src[1][:, :, 0] != pad_idx).unsqueeze(1)
        src_object_mask = (src[2][:, :, 0] != pad_idx).unsqueeze(1)

        enc_src_mask = (src_image_mask, src_motion_mask, src_object_mask)
        dec_src_mask = src_image_mask & src_motion_mask
        src_mask = (enc_src_mask, dec_src_mask)
    elif isinstance(src, tuple) and len(src) == 2:
        src_image_mask = (src[0][:, :, 0] != pad_idx).unsqueeze(1)
        src_motion_mask = (src[1][:, :, 0] != pad_idx).unsqueeze(1)

        enc_src_mask = (src_image_mask, src_motion_mask)
        dec_src_mask = src_image_mask & src_motion_mask
        src_mask = (enc_src_mask, dec_src_mask)
    else:
        # 即只有src_image_mask，即二维特征的mask
        src_mask = src_image_mask = (src[:, :, 0] != pad_idx).unsqueeze(1)

    # 判断是否需要对trg，也就是解码器的输入进行掩码
    if trg and r2l_trg:
        """
        trg_mask是填充掩码和序列掩码，&前是填充掩码，&后是通过subsequent_mask函数得到的序列掩码
        其中type_as，是为了让序列掩码和填充掩码的维度一致
        """
        trg_mask = (trg != pad_idx).unsqueeze(1) & sequence_mask(trg.size(1)).type_as(src_image_mask.data)
        # r2l_trg的填充掩码
        r2l_pad_mask = (r2l_trg != pad_idx).unsqueeze(1).type_as(src_image_mask.data)
        # r2l_trg的填充掩码和序列掩码
        r2l_trg_mask = r2l_pad_mask & sequence_mask(r2l_trg.size(1)).type_as(src_image_mask.data)
        # src_mask[batch, 1, lens]  trg_mask[batch, 1, lens]
        return src_mask, r2l_pad_mask, r2l_trg_mask, trg_mask
    else:
        return src_mask


class WordProbGenerator(nn.Module):
    """
    文本生成器，即把Decoder层的输出通过最后一层softmax层变化为词概率
    """

    def __init__(self, d_model, vocab_size):
        """
        :param d_model: 词向量维度
        :param vocab_size: 词典大小
        """
        super(WordProbGenerator, self).__init__()
        # 通过线性层的映射，映射成词典大小的维度
        self.linear = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        # 通过softmax函数对词概率做出估计
        return F.log_softmax(self.linear(x), dim=-1)


class ABDTransformer(nn.Module):
    """
    拼凑出Transformer
    """

    def __init__(self, vocab, d_feat, d_model, d_ff, n_heads, n_layers, dropout, feature_mode, device='cuda'):
        """
        :param vocab: 字典长度
        :param d_feat: per frame dimension（每帧的维度）
        :param d_model: 词向量的长度
        :param d_ff: FNN（FeedForward）第二层隐藏层输入的维度
        :param n_heads: 多头注意力时的头数
        :param n_layers: 编码器和解码器的层数
        :param dropout: drop的比率
        :param feature_mode: 提取视频特征的模式
        :param device: 是否使用gpu
        """
        super(ABDTransformer, self).__init__()
        self.vocab = vocab
        self.device = device
        self.feature_mode = feature_mode
        attn = MultiHeadAttention(n_heads, d_model, dropout)  # 多头注意力计算
        feed_forward = FeedForward(d_model, d_ff)  # 前馈神经网络

        """
        提取视频特征向量
        通过feature_mode判断d_feat提取出的维度，也就是提取了多少种信息
        共有四种特征向量信息，四种特征向量依次为：
        image_mask：二维特征向量
        motion_mask：三维特征向量
        object_mask：目标检测，分为两部分，第一部分是目标检测框的坐标，第二部分是被检测目标的特征向量
        rel_mask：是目标之间的关系特征向量
        """
        if feature_mode == 'one':
            # 使用unknown_src_embed命名的目的：提取视频一个特征向量的时候，不一定会提取什么种类的特征向量
            self.unknown_src_embed = FeatEmbedding(d_feat, d_model, dropout)
        elif feature_mode == 'two':
            self.image_src_embed = FeatEmbedding(d_feat[0], d_model, dropout)
            self.motion_src_embed = FeatEmbedding(d_feat[1], d_model, dropout)
        elif feature_mode == 'three':
            self.image_src_embed = FeatEmbedding(d_feat[0], d_model, dropout)
            self.motion_src_embed = FeatEmbedding(d_feat[1], d_model, dropout)
            self.object_src_embed = FeatEmbedding(d_feat[2], d_model, dropout)
        elif feature_mode == 'four':
            self.image_src_embed = FeatEmbedding(d_feat[0], d_model, dropout)
            self.motion_src_embed = FeatEmbedding(d_feat[1], d_model, dropout)
            self.object_src_embed = FeatEmbedding(d_feat[2], d_model, dropout)
            self.rel_src_embed = FeatEmbedding(d_feat[3], d_model, dropout)
        else:
            raise "feature_mode没有该模式，只有['one','two','three','four']四种模式"

        # 把特征向量提取成d_model维度的词向量
        self.trg_embed = WordEmbedding(vocab.n_vocabs, d_model)
        # 提取位置向量
        self.pos_embed = PositionalEncoding(d_model, dropout)
        # 编码层
        self.encoder = Encoder(n_layers, EncoderLayer(d_model, deepcopy(attn), deepcopy(feed_forward), dropout))
        """
        单向解码层
        使用deepcopy的原因：因为每个层的参数是不同的，因此通过deepcopy拷贝一份到新的内存里，避免共享参数
        """
        self.r2l_decoder = R2LDecoder(n_layers, DecoderLayer(d_model, deepcopy(attn), deepcopy(feed_forward),
                                                             sublayer_num=3, dropout=dropout))
        # 双向解码层
        self.l2r_decoder = L2RDecoder(n_layers, DecoderLayer(d_model, deepcopy(attn), deepcopy(feed_forward),
                                                             sublayer_num=4, dropout=dropout))
        # 生成单词概率分布
        self.word_prob_generator = WordProbGenerator(d_model, vocab.n_vocabs)

    def _encoder_feature_concat(self, src, feature_type, src_mask):
        """
        为接下来的encoder函数做准备，主要目的是对视频的特征向量做处理
        :param src: 特征向量
        :param feature_type: 根据视频的类别不同，使用不同的特征向量生成函数， ['image', 'motion', 'object', 'rel']
        :param src_mask: 特征向量掩码的标志
        :return: 经过处理后的视频特征向量
        """
        if feature_type == 'rel':
            # 视频的关系特征向量不需要进行位置向量
            x = self.rel_src_embed(src)  # 提取目标关系特征向量
            return self.encoder(x, src_mask)  # 送入编码器进行编码

        # 例：对于'image'，下面的调用为 self.image_src_embed(src)
        x = self.__getattribute__(f'{feature_type}_src_embed')(src)
        x = self.pos_embed(x)  # 提取视频位置特征向量
        return self.encoder(x, src_mask)  # 送入编码器进行编码

    def encode(self, src, src_mask):
        """
        对数据进行编码，此处主要目的是对不同类型的视频特征向量进行编码
        :param src: 视频的特征向量
        :param src_mask: 视频特征向量的掩码标志
        :return: 成功被编码器编码的视频特征向量
        """
        x_list = []  # 存储不同类型的视频特征向量被编码后的向量
        feature_type_list = ['image', 'motion', 'object', 'rel']  # 视频特征向量的类型
        feature_mode_dict = {'two': 2, 'three': 3, 'four': 4}  # 输入视频特征向量的种类

        if self.feature_mode == 'one':
            return self._encoder_feature_concat[src, 'unknown', src_mask]

        for i, feature_type in enumerate(feature_type_list):
            # 对于不同的feature_mode，拥有的encode的种类也不同
            if i == feature_mode_dict[self.feature_mode]:
                break
            x_list.append(self._encoder_feature_concat[src[i], feature_type, src_mask[i]])

        # TODO（灵感）：这里是否能添加一个线性变化，找出对于视频词向量更为有作用的模式和权重，这样也具有一定的解释性
        return sum(x_list)  # 对于不同feature_type提取的向量进行叠加

    def r2l_decode(self, trg, memory, src_mask, trg_mask):
        """
        对于单向编码，把视频向量转为文本向量，并且添加位置向量
        :param trg: 解码器的输入
        :param memory: 编码器的输出，也就是传给解码器的K、V
        :param src_mask: 编码器输出的掩码标志
        :param trg_mask: 解码器的掩码和单词掩码序列（看不见后面的词）
        :return:
        """
        x = self.trg_embed(trg)  # 把视频向量转为单向编码的词向量
        x = self.pos_embed(x)
        return self.r2l_decoder(x, memory, src_mask, trg_mask)

    def l2r_decode(self, trg, memory, src_mask, trg_mask, r2l_memory, r2l_trg_mask):
        """
        对于双向编码，把视频向量转为文本向量，并且添加位置向量
        :param trg: 解码器的输入
        :param memory: 编码器的输出，也就是传给解码器的K、V
        :param src_mask: 编码器输出的掩码标志
        :param trg_mask: 解码器的掩码和单词掩码序列（看不见后面的词）
        :param r2l_memory: 从右到左解码器的输出
        :param r2l_trg_mask: 从右到左解码器的输出的填充掩码和序列掩码
        :return:
        """
        x = self.trg_embed(trg)  # 把视频向量转为双向编码的词向量
        x = self.pos_embed(x)
        return self.l2r_decoder(x, memory, src_mask, trg_mask, r2l_memory, r2l_trg_mask)

    def forward(self, src, r2l_trg, trg, mask):
        """
        :param src: 编码器的输入
        :param r2l_trg: 从右到左解码器的输入
        :param trg: 从左到右解码器的输入
        :param mask: mask标志
        :return: 从右到左解码器和从左到右解码器的输出词概率分布
        """
        # mask应该是个元组，其中src_mask是包括了不同特征模式的mask的元组
        if len(mask) == 4:
            src_mask, r2l_pad_mask, r2l_trg_mask, trg_mask = mask
        else:
            raise "mask返回的是不带有解码器输入掩码的掩码元组，确认src_trg_mask()函数的参数"

        if self.feature_mode == 'one':
            # 得到视频encode后的输出
            encoding_output = self.encode(src, src_mask)
            # 视频特征单向编码后送入三层残差网络的解码器后得到的输出
            r2l_output = self.r2l_decode(r2l_trg, encoding_output, src_mask, r2l_trg_mask)
            # 视频特征双向编码后送入四层残差网络的解码器后得到的输出
            l2r_output = self.l2r_decode(trg, encoding_output, src_mask, trg_mask, r2l_output, r2l_pad_mask)
        elif self.feature_mode == 'two' or 'three' or 'four':
            # enc_src_mask是视频所有类型的特征掩码；dec_src_mask是二维和三维类型的特征的掩码
            enc_src_mask, dec_src_mask = src_mask
            # 视频特征向量模式的不同，对应不同的掩码方式
            encoding_output = self.encode(src, enc_src_mask)
            r2l_output = self.r2l_decode(r2l_trg, encoding_output, dec_src_mask, r2l_trg_mask)
            l2r_output = self.l2r_decode(trg, encoding_output, dec_src_mask, trg_mask, r2l_output, r2l_pad_mask)
        else:
            raise "没有这种feature_mode，只有['one','two','three','four']"

        # 预测解码词概率分布
        r2l_pred = self.word_prob_generator(r2l_output)
        l2r_pred = self.word_prob_generator(l2r_output)

        return r2l_pred, l2r_pred

    def greedy_decode(self, batch_size, src_mask, memory, max_len):
        """
        针对r2l的解码单词生成，贪婪解码，每次按照最大概率的词作为候选词
        :param batch_size: 每次送入的数据的数量
        :param src_mask: 编码器输入数据的掩码标志
        :param memory: 编码器的输出
        :param max_len: 最大的迭代次数，即生成单词数
        :return: 返回的r2l_hidden，即未来送入l2r中的r2l_memory；output是r2l层的预测输出
        """
        eos_idx = self.vocab.word2idx['']  # 符号，表示结束输出的标志
        with torch.no_grad():
            # 构建一个batch_size大小的向量存储eos标志，作为初始化的output
            output = torch.ones(batch_size, 1).fill_(eos_idx).long().cuda()
            # 迭代生成最终输出
            for i in range(max_len + 2 - 1):
                # 构建解码器输入的序列掩码，掩盖后续的词
                trg_mask = sequence_mask(output.size(1))
                # 把初始化的输出和编码器的输出进行解码输出
                dec_out = self.r2l_decode(output, memory, src_mask, trg_mask)  # batch, len, d_model
                r2l_hidden = dec_out
                # 按照最大概率的词作为候选词
                pred = self.word_prob_generator(dec_out)  # batch, len, n_vocabs
                next_word = pred[:, -1].max(dim=-1)[1].unsqueeze(1)  # pred[:, -1]([batch, n_vocabs])
                output = torch.cat([output, next_word], dim=-1)  # 拼接预测单词送入解码器解码

        # 返回的r2l_hidden，即未来送入l2r中的r2l_memory；output是r2l层的预测输出
        return r2l_hidden, output

    def r2l_beam_search_decode(self, batch_size, src, src_mask, model_encodings, beam_size, max_len):
        """
        Beam Search算法可以参考：https://www.cnblogs.com/nickchen121/p/15499576.html
        在每生成一个单词的时间步上，不是只保留当前分数最高的1个输出，而是保留num_beams个。
        当num_beams=1时集束搜索就退化成了贪心搜索，也就是上述的greedy_decode。
        :param batch_size: 一次送入数据的大小
        :param src: 编码器的输入
        :param src_mask: 编码器输入的掩码
        :param model_encodings:
        :param beam_size:
        :param max_len: 最大迭代数，即生成单词数
        :return:
        """
        # batch_size = src.shape[0]
        end_symbol = self.vocab.word2idx['']  # 结束符号
        start_symbol = self.vocab.word2idx['']  # 开始符号
        r2l_output = None  # r2l解码器的输出

        r2l_outputs = None

        # 1.1 Setup Src
        # src has shape (batch_size, sent_len)
        # src_mask has shape (batch_size, 1, sent_len)
        # src_mask = (src[:, :, 0] != self.vocab.word2idx['']).unsqueeze(-2)  # TODO Untested
        # model_encodings has shape (batch_size, sentence_len, d_model)
        # model_encodings = self.encode(src, src_mask)

        # 1.2 Setup Tgt Hypothesis Tracking
        # hypothesis is List(4 bt)[(cur beam_sz, dec_sent_len)], init: List(4 bt)[(1 init_beam_sz, dec_sent_len)]
        # hypotheses[i] is shape (cur beam_sz, dec_sent_len)
        hypotheses = [copy.deepcopy(torch.full((1, 1), start_symbol, dtype=torch.long,
                                               device=self.device)) for _ in range(batch_size)]
        # List after init: List 4 bt of List of len max_len_completed, init: List of len 4 bt of []
        completed_hypotheses = [copy.deepcopy([]) for _ in range(batch_size)]
        # List len batch_sz of shape (cur beam_sz), init: List(4 bt)[(1 init_beam_sz)]
        # hyp_scores[i] is shape (cur beam_sz)
        hyp_scores = [copy.deepcopy(torch.full((1,), 0, dtype=torch.float, device=self.device))
                      for _ in range(batch_size)]  # probs are log_probs must be init at 0.

        # 2. Iterate: Generate one char at a time until maxlen
        for _ in range(max_len + 1):
            if all([len(completed_hypotheses[i]) == beam_size for i in range(batch_size)]):
                break

            """
            2.1 Setup the batch. Since we use beam search, each batch has a variable number (called cur_beam_size)
            between 0 and beam_size of hypotheses live at any moment. We decode all hypotheses for all batches at
            the same time, so we must copy the src_encodings, src_mask, etc the appropriate number fo times for
            the number of hypotheses for each example. We keep track of the number of live hypotheses for each example.
            We run all hypotheses for all examples together through the decoder and log-softmax,
            and then use `torch.split` to get the appropriate number of hypotheses for each example in the end.
            """
            cur_beam_sizes, last_tokens, model_encodings_l, src_mask_l = [], [], [], []
            for i in range(batch_size):
                if hypotheses[i] is None:
                    cur_beam_sizes += [0]
                    continue
                cur_beam_size, decoded_len = hypotheses[i].shape
                cur_beam_sizes += [cur_beam_size]
                last_tokens += [hypotheses[i]]
                model_encodings_l += [model_encodings[i:i + 1]] * cur_beam_size
                src_mask_l += [src_mask[i:i + 1]] * cur_beam_size
            # shape (sum(4 bt * cur_beam_sz_i), 1 dec_sent_len, 128 d_model)
            model_encodings_cur = torch.cat(model_encodings_l, dim=0)
            src_mask_cur = torch.cat(src_mask_l, dim=0)
            y_tm1 = torch.cat(last_tokens, dim=0)
            # shape (sum(4 bt * cur_beam_sz_i), 1 dec_sent_len, 128 d_model)
            if self.feature_mode == 'one':
                out = self.r2l_decode(Variable(y_tm1).to(self.device), model_encodings_cur, src_mask_cur,
                                      Variable(sequence_mask(y_tm1.size(-1)).type_as(src.data)).to(self.device))
            elif self.feature_mode == 'two' or 'three' or 'four':
                out = self.r2l_decode(Variable(y_tm1).to(self.device), model_encodings_cur, src_mask_cur,
                                      Variable(sequence_mask(y_tm1.size(-1)).type_as(src[0].data)).to(self.device))
            else:
                raise "out为None"

            r2l_output = out

            # shape (sum(4 bt * cur_beam_sz_i), 1 dec_sent_len, 50002 vocab_sz)
            log_prob = self.word_prob_generator(out[:, -1, :]).unsqueeze(1)
            # shape (sum(4 bt * cur_beam_sz_i), 1 dec_sent_len, 50002 vocab_sz)
            _, decoded_len, vocab_sz = log_prob.shape
            # log_prob = log_prob.reshape(batch_size, cur_beam_size, decoded_len, vocab_sz)
            # shape List(4 bt)[(cur_beam_sz_i, dec_sent_len, 50002 vocab_sz)]
            # log_prob[i] is (cur_beam_sz_i, dec_sent_len, 50002 vocab_sz)
            log_prob = torch.split(log_prob, cur_beam_sizes, dim=0)

            """
            2.2 Now we process each example in the batch. 
            Note that the example may have already finished processing before
            other examples (no more hypotheses to try), in which case we continue
            """
            new_hypotheses, new_hyp_scores = [], []
            for i in range(batch_size):
                if hypotheses[i] is None or len(completed_hypotheses[i]) >= beam_size:
                    new_hypotheses += [None]
                    new_hyp_scores += [None]
                    continue

                """
                2.2.1 We compute the cumulative scores for each live hypotheses for the example
                hyp_scores is the old scores for the previous stage, and `log_prob` are the new probs for
                this stage. Since they are log probs, we sum them instead of multiplying them.
                The .view(-1) forces all the hypotheses into one dimension. The shape of this dimension is
                cur_beam_sz * vocab_sz (ex: 5 * 50002).
                So after getting the topk from it, 
                we can recover the generating sentence and the next word using: ix // vocab_sz, ix % vocab_sz.
                """
                cur_beam_sz_i, dec_sent_len, vocab_sz = log_prob[i].shape
                # shape (vocab_sz,)
                cumulative_hyp_scores_i = (hyp_scores[i].unsqueeze(-1).unsqueeze(-1)
                                           .expand((cur_beam_sz_i, 1, vocab_sz)) + log_prob[i]).view(-1)

                """
                2.2.2 We get the topk values in cumulative_hyp_scores_i and compute the current (generating) sentence
                and the next word using: ix // vocab_sz, ix % vocab_sz.
                """

                # shape (cur_beam_sz,)
                live_hyp_num_i = beam_size - len(completed_hypotheses[i])
                # shape (cur_beam_sz,). Vals are between 0 and 50002 vocab_sz
                top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(cumulative_hyp_scores_i, k=live_hyp_num_i)
                """
                shape (cur_beam_sz,). prev_hyp_ids vals are 0 <= val < cur_beam_sz. 
                hyp_word_ids vals are 0 <= val < vocab_len
                """
                prev_hyp_ids = top_cand_hyp_pos // self.vocab.n_vocabs
                hyp_word_ids = top_cand_hyp_pos % self.vocab.n_vocabs

                """
                2.2.3 For each of the topk words, we append the new word to the current (generating) sentence
                We add this to new_hypotheses_i and add its corresponding total score to new_hyp_scores_i
                """

                # Removed live_hyp_ids_i, which is used in the LSTM decoder to track live hypothesis ids
                new_hypotheses_i, new_hyp_scores_i = [], []
                for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(prev_hyp_ids, hyp_word_ids,
                                                                        top_cand_hyp_scores):
                    prev_hyp_id, hyp_word_id, cand_new_hyp_score = \
                        prev_hyp_id.item(), hyp_word_id.item(), cand_new_hyp_score.item()

                    new_hyp_sent = torch.cat(
                        (hypotheses[i][prev_hyp_id], torch.tensor([hyp_word_id], device=self.device)))
                    if hyp_word_id == end_symbol:
                        completed_hypotheses[i].append(Hypothesis(
                            value=[self.vocab.idx2word[a.item()] for a in new_hyp_sent[1:-1]],
                            score=cand_new_hyp_score))
                    else:
                        new_hypotheses_i.append(new_hyp_sent.unsqueeze(-1))
                        new_hyp_scores_i.append(cand_new_hyp_score)

                """
                2.2.4 We may find that the hypotheses_i for some example in the batch
                is empty - we have fully processed that example. We use None as a sentinel in this case.
                Above, the loops gracefully handle None examples.
                """
                if len(new_hypotheses_i) > 0:
                    hypotheses_i = torch.cat(new_hypotheses_i, dim=-1).transpose(0, -1).to(self.device)
                    hyp_scores_i = torch.tensor(new_hyp_scores_i, dtype=torch.float, device=self.device)
                else:
                    hypotheses_i, hyp_scores_i = None, None
                new_hypotheses += [hypotheses_i]
                new_hyp_scores += [hyp_scores_i]
            # print(new_hypotheses, new_hyp_scores)
            hypotheses, hyp_scores = new_hypotheses, new_hyp_scores

        """
        2.3 Finally, we do some postprocessing to get our final generated candidate sentences.
        Sometimes, we may get to max_len of a sentence and still not generate the  end token.
        In this case, the partial sentence we have generated will not be added to the completed_hypotheses
        automatically, and we have to manually add it in. We add in as many as necessary so that there are
        `beam_size` completed hypotheses for each example.
        Finally, we sort each completed hypothesis by score.
        """
        for i in range(batch_size):
            hyps_to_add = beam_size - len(completed_hypotheses[i])
            if hyps_to_add > 0:
                scores, ix = torch.topk(hyp_scores[i], k=hyps_to_add)
                for score, id_ in zip(scores, ix):
                    completed_hypotheses[i].append(Hypothesis(
                        value=[self.vocab.idx2word[a.item()] for a in hypotheses[i][id_][1:]],
                        score=score))
            completed_hypotheses[i].sort(key=lambda hyp: hyp.score, reverse=True)
        return r2l_output, completed_hypotheses

    def beam_search_decode(self, src, beam_size, max_len):
        """
        An Implementation of Beam Search for the Transformer Model.
        Beam search is performed in a batched manner. Each example in a batch generates `beam_size` hypotheses.
        We return a list (len: batch_size) of list (len: beam_size) of Hypothesis,
        which contain our output decoded sentence sand their scores.
        :param src: shape (sent_len, batch_size). Each val is 0 < val < len(vocab_dec). The input tokens to the decoder.
        :param max_len: the maximum length to decode
        :param beam_size: the beam size to use
        :return completed_hypotheses: A List of length batch_size,
        each containing a List of beam_size Hypothesis objects.Hypothesis is a named Tuple,
        its first entry is "value" and is a List of strings which contains the translated word
        (one string is one word token).
        The second entry is "score" and it is the log-prob score for this translated sentence.
        Note: Below I note "4 bt", "5 beam_size" as the shapes of objects. 4, 5 are default values.
         Actual values may differ.
        """

        # 1. Setup
        start_symbol = self.vocab.word2idx['']
        end_symbol = self.vocab.word2idx['']

        # 1.1 Setup Src
        # src has shape (batch_size, sent_len)
        # src_mask has shape (batch_size, 1, sent_len)
        # src_mask = (src[:, :, 0] != self.vocab.word2idx['']).unsqueeze(-2)  # TODO Untested
        src_mask = src_trg_mask(src, r2l_trg=None, trg=None, pad_idx=self.vocab.word2idx[''])
        # model_encodings has shape (batch_size, sentence_len, d_model)
        if self.feature_mode == 'one':
            batch_size = src.shape[0]
            dec_src_mask = None
            model_encodings = self.encode(src, src_mask)
            r2l_memory, r2l_completed_hypotheses = self.r2l_beam_search_decode(batch_size, src, src_mask,
                                                                               model_encodings=model_encodings,
                                                                               beam_size=1, max_len=max_len)
        elif self.feature_mode == 'two' or 'three' or 'four':
            batch_size = src[0].shape[0]
            enc_src_mask = src_mask[0]
            dec_src_mask = src_mask[1]
            model_encodings = self.encode(src, enc_src_mask)
            r2l_memory, r2l_completed_hypotheses = self.r2l_beam_search_decode(batch_size, src, dec_src_mask,
                                                                               model_encodings=model_encodings,
                                                                               beam_size=1, max_len=max_len)
        else:
            raise "batch_size为None"

        """
        1.2 Setup r2l target output
        r2l_memory, r2l_completed_hypotheses = self.r2l_beam_search_decode(batch_size, src, src_mask,
                                                                           model_encodings=model_encodings,
                                                                           beam_size=1, max_len=max_len)
        r2l_memory, r2l_completed_hypotheses = self.greedy_decode(batch_size, src_mask, model_encodings, max_len)
        beam_r2l_memory = [copy.deepcopy(r2l_memory) for _ in range(beam_size)]
        
        1.3 Setup Tgt Hypothesis Tracking
        """

        # hypothesis is List(4 bt)[(cur beam_sz, dec_sent_len)], init: List(4 bt)[(1 init_beam_sz, dec_sent_len)]
        # hypotheses[i] is shape (cur beam_sz, dec_sent_len)
        hypotheses = [copy.deepcopy(torch.full((1, 1), start_symbol, dtype=torch.long,
                                               device=self.device)) for _ in range(batch_size)]
        # List after init: List 4 bt of List of len max_len_completed, init: List of len 4 bt of []
        completed_hypotheses = [copy.deepcopy([]) for _ in range(batch_size)]
        # List len batch_sz of shape (cur beam_sz), init: List(4 bt)[(1 init_beam_sz)]
        # hyp_scores[i] is shape (cur beam_sz)
        hyp_scores = [copy.deepcopy(torch.full((1,), 0, dtype=torch.float, device=self.device))
                      for _ in range(batch_size)]  # probs are log_probs must be init at 0.

        # 2. Iterate: Generate one char at a time until maxlen
        for _ in range(max_len + 1):
            if all([len(completed_hypotheses[i]) == beam_size for i in range(batch_size)]):
                break

            """
            2.1 Setup the batch. Since we use beam search, each batch has a variable number (called cur_beam_size)
            between 0 and beam_size of hypotheses live at any moment. We decode all hypotheses for all batches at
            the same time, so we must copy the src_encodings, src_mask, etc the appropriate number fo times for
            the number of hypotheses for each example. We keep track of the number of live hypotheses for each example.
            We run all hypotheses for all examples together through the decoder and log-softmax,
            and then use `torch.split` to get the appropriate number of hypotheses for each example in the end.
            """

            cur_beam_sizes, last_tokens, model_encodings_l, src_mask_l, r2l_memory_l = [], [], [], [], []
            for i in range(batch_size):
                if hypotheses[i] is None:
                    cur_beam_sizes += [0]
                    continue
                cur_beam_size, decoded_len = hypotheses[i].shape
                cur_beam_sizes += [cur_beam_size]
                last_tokens += [hypotheses[i]]
                model_encodings_l += [model_encodings[i:i + 1]] * cur_beam_size
                if self.feature_mode == 'one':
                    src_mask_l += [src_mask[i:i + 1]] * cur_beam_size
                elif dec_src_mask and (self.feature_mode == 'two' or 'three' or 'four'):
                    src_mask_l += [dec_src_mask[i:i + 1]] * cur_beam_size
                r2l_memory_l += [r2l_memory[i: i + 1]] * cur_beam_size
            # shape (sum(4 bt * cur_beam_sz_i), 1 dec_sent_len, 128 d_model)
            model_encodings_cur = torch.cat(model_encodings_l, dim=0)
            src_mask_cur = torch.cat(src_mask_l, dim=0)
            y_tm1 = torch.cat(last_tokens, dim=0)
            r2l_memory_cur = torch.cat(r2l_memory_l, dim=0)
            # shape (sum(4 bt * cur_beam_sz_i), 1 dec_sent_len, 128 d_model)
            if self.feature_mode == 'one':
                out = self.l2r_decode(Variable(y_tm1).to(self.device), model_encodings_cur, src_mask_cur,
                                      Variable(sequence_mask(y_tm1.size(-1)).type_as(src.data)).to(self.device),
                                      r2l_memory_cur, r2l_trg_mask=None)
            elif self.feature_mode == 'two' or 'three' or 'four':
                out = self.l2r_decode(Variable(y_tm1).to(self.device), model_encodings_cur, src_mask_cur,
                                      Variable(sequence_mask(y_tm1.size(-1)).type_as(src[0].data)).to(self.device),
                                      r2l_memory_cur, r2l_trg_mask=None)
            else:
                raise "out为None"

            # shape (sum(4 bt * cur_beam_sz_i), 1 dec_sent_len, 50002 vocab_sz)
            log_prob = self.word_prob_generator(out[:, -1, :]).unsqueeze(1)
            # shape (sum(4 bt * cur_beam_sz_i), 1 dec_sent_len, 50002 vocab_sz)
            _, decoded_len, vocab_sz = log_prob.shape
            # log_prob = log_prob.reshape(batch_size, cur_beam_size, decoded_len, vocab_sz)
            # shape List(4 bt)[(cur_beam_sz_i, dec_sent_len, 50002 vocab_sz)]
            # log_prob[i] is (cur_beam_sz_i, dec_sent_len, 50002 vocab_sz)
            log_prob = torch.split(log_prob, cur_beam_sizes, dim=0)

            """
            2.2 Now we process each example in the batch.
            Note that the example may have already finished processing before.
            other examples (no more hypotheses to try), in which case we continue
            """
            new_hypotheses, new_hyp_scores = [], []
            for i in range(batch_size):
                if hypotheses[i] is None or len(completed_hypotheses[i]) >= beam_size:
                    new_hypotheses += [None]
                    new_hyp_scores += [None]
                    continue

                # 2.2.1 We compute the cumulative scores for each live hypotheses for the example
                # hyp_scores is the old scores for the previous stage, and `log_prob` are the new probs for
                # this stage. Since they are log probs, we sum them instead of multiplying them.
                # The .view(-1) forces all the hypotheses into one dimension. The shape of this dimension is
                # cur_beam_sz * vocab_sz (ex: 5 * 50002). So after getting the topk from it, we can recover the
                # generating sentence and the next word using: ix // vocab_sz, ix % vocab_sz.
                cur_beam_sz_i, dec_sent_len, vocab_sz = log_prob[i].shape
                "shape (vocab_sz,)"
                cumulative_hyp_scores_i = (hyp_scores[i].unsqueeze(-1).unsqueeze(-1)
                                           .expand((cur_beam_sz_i, 1, vocab_sz)) + log_prob[i]).view(-1)

                """
                2.2.2 We get the topk values in cumulative_hyp_scores_i and compute the current (generating) sentence
                and the next word using: ix // vocab_sz, ix % vocab_sz.
                """
                # shape (cur_beam_sz,)
                live_hyp_num_i = beam_size - len(completed_hypotheses[i])
                # shape (cur_beam_sz,). Vals are between 0 and 50002 vocab_sz
                top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(cumulative_hyp_scores_i, k=live_hyp_num_i)
                """
                shape (cur_beam_sz,). prev_hyp_ids vals are 0 <= val < cur_beam_sz. 
                hyp_word_ids vals are 0 <= val < vocab_len
                """
                prev_hyp_ids = top_cand_hyp_pos // self.vocab.n_vocabs
                hyp_word_ids = top_cand_hyp_pos % self.vocab.n_vocabs

                """
                2.2.3 For each of the topk words, we append the new word to the current (generating) sentence
                We add this to new_hypotheses_i and add its corresponding total score to new_hyp_scores_i
                """
                # Removed live_hyp_ids_i, which is used in the LSTM decoder to track live hypothesis ids
                new_hypotheses_i, new_hyp_scores_i = [], []
                for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(prev_hyp_ids, hyp_word_ids,
                                                                        top_cand_hyp_scores):
                    prev_hyp_id, hyp_word_id, cand_new_hyp_score = \
                        prev_hyp_id.item(), hyp_word_id.item(), cand_new_hyp_score.item()

                    new_hyp_sent = torch.cat(
                        (hypotheses[i][prev_hyp_id], torch.tensor([hyp_word_id], device=self.device)))
                    if hyp_word_id == end_symbol:
                        completed_hypotheses[i].append(Hypothesis(
                            value=[self.vocab.idx2word[a.item()] for a in new_hyp_sent[1:-1]],
                            score=cand_new_hyp_score))
                    else:
                        new_hypotheses_i.append(new_hyp_sent.unsqueeze(-1))
                        new_hyp_scores_i.append(cand_new_hyp_score)

                # 2.2.4 We may find that the hypotheses_i for some example in the batch
                # is empty - we have fully processed that example. We use None as a sentinel in this case.
                # Above, the loops gracefully handle None examples.
                if len(new_hypotheses_i) > 0:
                    hypotheses_i = torch.cat(new_hypotheses_i, dim=-1).transpose(0, -1).to(self.device)
                    hyp_scores_i = torch.tensor(new_hyp_scores_i, dtype=torch.float, device=self.device)
                else:
                    hypotheses_i, hyp_scores_i = None, None
                new_hypotheses += [hypotheses_i]
                new_hyp_scores += [hyp_scores_i]
            # print(new_hypotheses, new_hyp_scores)
            hypotheses, hyp_scores = new_hypotheses, new_hyp_scores

        """
        2.3 Finally, we do some postprocessing to get our final generated candidate sentences.
        Sometimes, we may get to max_len of a sentence and still not generate the  end token.
        In this case, the partial sentence we have generated will not be added to the completed_hypotheses
        automatically, and we have to manually add it in. We add in as many as necessary so that there are
        `beam_size` completed hypotheses for each example.
        Finally, we sort each completed hypothesis by score.
        """
        for i in range(batch_size):
            hyps_to_add = beam_size - len(completed_hypotheses[i])
            if hyps_to_add > 0:
                scores, ix = torch.topk(hyp_scores[i], k=hyps_to_add)
                for score, id_ in zip(scores, ix):
                    completed_hypotheses[i].append(Hypothesis(
                        value=[self.vocab.idx2word[a.item()] for a in hypotheses[i][id_][1:]],
                        score=score))
            completed_hypotheses[i].sort(key=lambda hyp: hyp.score, reverse=True)
        # print('completed_hypotheses', completed_hypotheses)
        return r2l_completed_hypotheses, completed_hypotheses

参考链接：

博客配套视频链接: https://space.bilibili.com/383551518?spm_id_from=333.1007.0.0 b 站直接看

配套 github 链接：https://github.com/nickchen121/Pre-training-language-model

配套博客链接：https://www.cnblogs.com/nickchen121/p/15105048.html

注释版本代码地址：https://www.cnblogs.com/nickchen121/p/16518613.html

AIGC视觉生成革命：文生图、图生图与视频生成垂直模型发展全景报告（2025） Liudef06小白 AIGC 人工智能 AI作画语言模型
一、引言：从实验工具到产业引擎的跃迁人工智能生成内容（AIGC）技术正经历从文本向多模态的范式转移。2023-2025年间，文生图、图生图与视频生成垂直模型逐步跨越技术奇点，从实验室玩具进化为工业化生产力工具。这一进程的核心驱动力在于架构创新、数据优化与场景深耕的三重突破：扩散模型与Transformer的融合催生了更高保真度的图像生成；十亿级多模态数据训练解决了复杂语义理解难题；而面向影视、电商
[论文阅读] 人工智能 | 读懂Meta-Fair：让LLM摆脱偏见的自动化测试新方法张较瘦_ 前沿技术论文阅读人工智能
读懂Meta-Fair：让LLM摆脱偏见的自动化测试新方法论文标题：Meta-Fair:AI-AssistedFairnessTestingofLargeLanguageModelsarXiv:2507.02533Meta-Fair:AI-AssistedFairnessTestingofLargeLanguageModelsMiguelRomero-Arjona,JoséA.Parejo,Jua
深度 |AI高质量数据集交易爆发式增长数智前沿数字化转型人工智能数据集
AI产业从通用模型向行业垂直应用快速融合下沉的阶段演进，人工智能三大基本要素之一数据，面临的高质量数据不足问题却凸显。财联社记者最新从业内获悉，目前各大模型企业迫切希望获得更多更好的高质量数据集，需求集中于头部企业行业知识底座构建，人工智能高质量数据集的需求量、交易量激增，已成为数据流通最活跃的领域。不过，高质量数据集的建设、流通环节均面临诸多问题，目前数据交易所并非模型语料最主要的采购途径。需求
轻量化分布式AGI架构：基于区块链构建终端神经元节点的互联网智脑探客木木夕分布式 agi 人工智能架构区块链
在2025年的技术发展背景下，轻量化分布式AGI架构正成为人工智能领域的重要突破方向。通过将终端设备转化为神经元节点，结合区块链技术构建去中心化的互联网智脑，不仅能够突破传统AGI开发的算力瓶颈，还能实现数据安全共享与价值分配。**这一架构将重塑人工智能的发展范式，使AGI能力从中心化实验室扩散至全球终端设备网络，最终形成一个去中心化、自演进、高可用的互联网级智能系统**。研究显示，通过知识密度提
写测试太烦？Copilot + Jest 让你 3 分钟搞定单元测试
网罗开发（小红书、快手、视频号同名）大家好，我是展菲，目前在上市企业从事人工智能项目研发管理工作，平时热衷于分享各种编程领域的软硬技能知识以及前沿技术，包括iOS、前端、HarmonyOS、Java、Python等方向。在移动端开发、鸿蒙开发、物联网、嵌入式、云原生、开源等领域有深厚造诣。图书作者：《ESP32-C3物联网工程开发实战》图书作者：《SwiftUI入门，进阶与实战》超级个体：CO
Python 训练营打卡 Day 46 2401_86382089 Python打卡 python
通道注意力一、什么是注意力注意力机制是一种让模型学会「选择性关注重要信息」的特征提取器，就像人类视觉会自动忽略背景，聚焦于图片中的主体（如猫、汽车）。transformer中的叫做自注意力机制，他是一种自己学习自己的机制，他可以自动学习到图片中的主体，并忽略背景。我们现在说的很多模块，比如通道注意力、空间注意力、通道注意力等等，都是基于自注意力机制的。从数学角度看，注意力机制是对输入特征进行加权求
Python编程电子书：从基础到实践王奥雷
本文还有配套的精品资源，点击获取简介：Python电子书汇集了基础语法、面向对象编程、标准及第三方库使用、文件操作、网络编程、并发编程、单元测试与调试、Python2与Python3的区别等核心知识点。通过实例和项目案例，帮助读者在Web开发、数据分析、人工智能等应用领域提升编程技能，跟上Python的技术进步。1.Python基础语法介绍Python作为一种高级编程语言，其易读性和简洁的语法使其
开源模型应用落地-OpenAI Agents SDK-集成MCP与Qwen3-8B模型的创新应用探索（七）开源技术探险家开源模型-实际应用落地开源 python ai 人工智能
一、前言在人工智能技术飞速发展的今天，如何将先进的模型和技术无缝结合，成为推动行业变革的关键。OpenAIAgents通过集成模型上下文协议（MCP）和阿里巴巴推出的Qwen3-8B模型，正开启一场智能应用的革命。这种创新的结合不仅提升了AI代理与外部工具之间的通信能力，还在多模态任务处理、个性化服务等领域展现出巨大潜力。本文将深入探讨这一技术组合的实际应用场景，揭示其在改善客户体验和提升运营效率
开源模型应用落地-OpenAI Agents SDK-集成Qwen3-8B-探索output_guardrail的创意应用（六）开源技术探险家开源模型-实际应用落地开源 python ai 人工智能
一、前言随着人工智能技术的迅猛发展，大语言模型（LLM）在各行各业的应用日益广泛。然而，模型生成的内容是否安全、合规、符合用户预期，成为开发者和企业不可忽视的问题。为此，OutputGuardrail应运而生，作为一种关键的安全机制，它在模型生成结果之后进行内容审核与过滤，确保输出不偏离道德、法律和业务规范。通过检测不当的内容，不仅提升了AI系统的可信度，也为构建更加稳健和负责任的人工智能应用提供
什么是深度学习框架中的计算图？杰瑞学AI Computer knowledge NLP/LLMs AI/AGI 深度学习人工智能 pytorch
在深度学习框架中，计算图是核心的数据结构和抽象概念，它用来表示和定义深度学习模型的计算过程。我们可以把它想象成一个描述数学运算如何组合和执行的有向图。以下是计算图的关键要素和作用：节点：代表操作或变量。操作：数学运算，如加法(+)、乘法(*)、矩阵乘法(matmul)、激活函数(ReLU,sigmoid)、卷积(conv2d)、损失函数(cross_entropy)等。变量：通常是张量，即存储数据
开源模型应用落地-让AI更懂你的每一次交互-Mem0集成Qdrant、Neo4j与Streamlit的创新实践（四）开源技术探险家开源模型-实际应用落地 neo4j 开源人工智能语言模型
一、前言在人工智能迅速发展的今天，如何让AI系统更懂“你”？答案或许藏在个性化的记忆管理之中。Mem0作为一个开源的记忆管理系统，正致力于为AI赋予长期记忆与个性化服务能力。通过结合高性能向量数据库Qdrant、图数据库Neo4j的强大关系分析能力以及Streamlit的高效可视化交互，我们可以打造出一个既能存储用户历史行为、又能实时推理并展示结果的智能记忆助手。本文将带您一步步探索这一技术组合的
Spatie Laravel-Data 数据转换器深度解析倪俪珍Phineas
SpatieLaravel-Data数据转换器深度解析laravel-dataPowerfuldataobjectsforLaravel项目地址:https://gitcode.com/gh_mirrors/la/laravel-data什么是数据转换器在SpatieLaravel-Data项目中，数据转换器(Transformers)扮演着将复杂数据类型转换为简单类型的关键角色。当我们需要将数据
【优秀文章】7月优秀文章推荐
优秀文章智能自主运动体与人工智能技术——环境感知、SLAM定位、路径规划、运动控制、多智能体协同作者：fpga和matlabC++之红黑树认识与实现作者：zzh_zao【手把手带你刷好题】–C语言基础编程题(十)作者：草莓熊Lotso飞算JavaAI：从“码农”到“代码指挥官”的终极进化论作者：可涵不会debug前端网页开发学习（HTML+CSS+JS）有这一篇就够！作者：一颗小谷粒
蛋白质结构预测/功能注释/交互识别/按需设计，中国海洋大学张树刚团队直击蛋白质智能计算核心任务 hyperai
蛋白质作为生命活动的主要承担者，在人体生理功能中扮演关键角色。然而传统研究面临结构解析成本高昂、功能注释严重滞后、新型蛋白质设计效率低下等挑战。近年来，生命科学对蛋白质复杂特性解析的需求日益迫切，大数据、深度学习、多模态计算等技术的突破性发展，为构建蛋白质智能计算体系提供了全新的发展契机。蛋白质智能计算体系的构建，使得蛋白质在大规模功能注释、交互预测及三维结构建模等领域取得显著成果，为药物发现与生
【心灵鸡汤】深度学习技能形成树：从零基础到AI专家的成长路径全解析智算菩萨人工智能深度学习
引言：技能树的生长哲学在这个人工智能浪潮汹涌的时代，深度学习犹如一棵参天大树，其根系深深扎入数学与计算科学的沃土，主干挺拔地承载着机器学习的核心理念，而枝叶则繁茂地延伸至计算机视觉、自然语言处理、强化学习等各个应用领域。对于初入此领域的新手而言，理解这棵技能树的生长规律，掌握其形成过程中的关键节点和发展阶段，将直接决定其在人工智能道路上能够走多远、攀多高。技能树的概念源于游戏设计，但在学习深度学习
深度模型训练，加速数据读取遇到显卡跑不满的问题不是吧这都有重名遇到的问题 llama 人工智能 LLM python
实测在pytorch的dataloader中使用prefetch_factor参数的时候，如果数据在机械硬盘上显卡始终是跑不满的，瓶颈在数据预加载速度上，当数据放在固态硬盘的时候就可以跑满。问题排查过程：一直在跑模型，但是数据量比较大，之前有段时间还是比较头疼显卡跑不满的。后来直接用钞能力，加了内存条，将数据缓存后一次性读到内存中终于可以跑满了，然后后面就一直没管这个了，唯一的缺点就是每次开始训练
yolov5训练失败总结 BTU_YC 深度学习 python pytorch
yolov5训练失败总结版本原因：在进行训练时，出现如下报错：UserWarning:Detectedcalloflr_scheduler.step()beforeoptimizer.step().InPyTorch1.1.0andlater,youshouldcallthemintheoppositeorder:optimizer.step()beforelr_scheduler.step().
MongoDB + Voyage AI 详解：重塑数据库与AI的协同范式 csdn_tom_168 NoSQL 数据库 mongodb 人工智能 AI
MongoDB+VoyageAI详解：重塑数据库与AI的协同范式2025年2月，MongoDB官方宣布收购VoyageAI，这一举措标志着数据库与人工智能技术的深度融合迈入新阶段。通过整合VoyageAI的先进AI检索与嵌入模型能力，MongoDB旨在重新定义AI时代的数据库架构，为企业构建智能应用提供端到端的数据基础设施。一、收购背景与技术战略1.行业趋势驱动AI数据挑战：随着生成式AI与大语言
HarmonyOS5.0仓颉引擎与盘古大模型：个性化作业批改系统架构设计与实现 H老师带你学鸿蒙系统架构 HarmonyOS5.0 鸿蒙华为仓颉教育
人工智能与边缘计算的融合正在重塑教育评价体系。本文将展示如何基于HarmonyOS5.0仓颉并发引擎和盘古大模型，构建新一代智能作业批改系统。系统架构全景graphTDA[学生端设备]-->|提交作业|B[仓颉边缘处理]B-->C[盘古大模型分析]C-->D[个性化反馈生成]D-->E[学生终端]D-->F[教师仪表盘]subgraphHarmonyOS分布式系统B-->|设备协同|G[教室平板集
阿里云瑶池数据库 Data Agent for Meta 正式发布，让 AI 更懂你的业务！数据库观点资讯人工智能
背景随着生成式人工智能（GenerativeAI）从概念验证迈向规模化商业落地，AIAgent已成为企业核心业务流程的重要组成部分。然而，当模型调用日益便捷时，核心痛点已不再是模型本身，而是集中在一个关键要素上：数据。AIAgent的落地瓶颈已从技术能力转向高质量、高相关性、安全合规的数据供给。企业面临的核心挑战在于：数据孤岛导致知识库分散，通用大模型难以理解专业业务传统数据管理依赖人工开发维护，
【TVM 教程】如何处理 TVM 报错
ApacheTVM是一个深度的深度学习编译框架，适用于CPU、GPU和各种机器学习加速芯片。更多TVM中文文档可访问→https://tvm.hyper.ai/运行TVM时，可能会遇到如下报错：---------------------------------------------------------------AnerroroccurredduringtheexecutionofTVM.F
【PaddleOCR】OCR文本检测与文本识别数据集整理，持续更新......
博主简介：曾任某智慧城市类企业算法总监，目前在美国市场的物流公司从事高级算法工程师一职，深耕人工智能领域，精通python数据挖掘、可视化、机器学习等，发表过AI相关的专利并多次在AI类比赛中获奖。CSDN人工智能领域的优质创作者，提供AI相关的技术咨询、项目开发和个性化解决方案等服务，如有需要请站内私信或者联系任意文章底部的的VX名片（ID：xf982831907）博主粉丝群介绍：①群内初中生、
多模态大模型的技术应用与未来展望：重构AI交互范式的新引擎 zhaoyi_he 重构人工智能
一、引言：为什么多模态是AI发展的下一场革命？过去十年，深度学习推动了计算机视觉和自然语言处理的飞跃，但两者的发展路径长期割裂。随着生成式AI和大模型时代的到来，**多模态大模型（MultimodalFoundationModels）**以统一的建模方式处理图像、文本、音频、视频等多源数据，重塑了“感知-认知-决策”链条，为AGI迈出关键一步。OpenAI的GPT-4o、Google的Gemini
探索 Qwen3-0.6B：轻量级大模型的技术突破与应用潜力
在大模型技术飞速发展的今天，轻量化、高性能的模型成为业界关注的焦点。Qwen3-0.6B作为阿里推出的轻量级大模型，凭借其独特的技术架构和卓越性能，在众多模型中脱颖而出。本文将深入探讨Qwen3-0.6B的技术特性、优势以及应用场景，带你领略这款轻量级大模型的魅力。一、Qwen3-0.6B核心技术架构Qwen3-0.6B基于Transformer架构进行优化，采用了一系列先进的技术手段，在保证模型
使用 C++ 实现 MFCC 特征提取与说话人识别系统 whoarethenext c++开发语言 mfcc 语音识别
使用C++实现MFCC特征提取与说话人识别系统在音频处理和人工智能领域，C++凭借其卓越的性能和对硬件的底层控制能力，在实时音频分析、嵌入式设备和高性能计算场景中占据着不可或缺的地位。本文将引导你了解如何使用C++库计算核心的音频特征——梅尔频率倒谱系数(MFCCs)，并进一步利用这些特征构建一个说话人识别（声纹识别）系统。Part1:在C/C++中计算MFCCs直接从零开始实现MFCC的所有计算
PyTorch中 item()、tolist()使用详解和实战示例点云SLAM PyTorch深度学习 pytorch 人工智能 python 深度学习张量的操作 item tolist
在PyTorch中，.item()和.tolist()是两个常用于从Tensor中提取Python原生数据的方法，尤其在调试、日志记录或将结果传给非张量库时非常有用。下面是它们的详解与代码示例。1..item()方法用途：将仅包含一个元素的张量（即标量张量）转换为对应的Python原生数据类型（float,int,等）。限制：只能用于只包含一个元素的Tensor，否则会报错。示例代码：import
Transformer模型压缩：结构化剪枝与混合精度量化研究 pk_xz123456 仿真模型机器学习深度学习 transformer 剪枝深度学习
Transformer模型压缩：结构化剪枝与混合精度量化研究摘要本文针对Transformer模型在实际部署中面临的计算资源消耗大、内存占用高和推理延迟等问题，提出了一种结合结构化剪枝与混合精度量化的综合压缩方案。我们首先分析了Transformer模型的结构特点及其在计算效率方面的瓶颈，然后系统地研究了结构化剪枝和混合精度量化的理论基础与实现方法。通过实验验证，我们的方法在保持模型性能的同时显著
ImportError: /nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4 爱编程的喵喵 Python基础课程 python ImportError torch nvJitLink 解决方案
大家好，我是爱编程的喵喵。双985硕士毕业，现担任全栈工程师一职，热衷于将数据思维应用到工作与生活中。从事机器学习以及相关的前后端开发工作。曾在阿里云、科大讯飞、CCF等比赛获得多次Top名次。现为CSDN博客专家、人工智能领域优质创作者。喜欢通过博客创作的方式对所学的知识进行总结与归纳，不仅形成深入且独到的理解，而且能够帮助新手快速入门。本文主要介绍了ImportError:/home/
【机器学习&深度学习】多分类评估策略一叶千舟深度学习【理论】深度学习【应用必备常识】大数据人工智能
目录前言一、多分类3大策略✅宏平均（MacroAverage）✅加权平均（WeightedAverage）✅微平均（MicroAverage）二、类比理解2.1宏平均（MacroAverage）2.1.1计算方式2.1.2适合场景2.1.3宏平均不适用的场景2.1.4宏平均一般用在哪些指标上？2.1.5怎么看macroavg指标？2.1.6宏平均值低说明了什么？2.1.7从宏平均指标中定位模型短板
error -- unsupported GNU version gcc later than 10 are not supported；（gcc、g++）众人（某音、某书同名）服务器 linux 运维
服务器跑dit时编译flash-atten以及pytorch的cuda版本检查出错，分别报错题目以及如下：想了下是系统找不到编译器subprocess.CalledProcessError:Command'['which','c++']'returnednon-zeroexitstatus1.备案，以后有人要用12我还得换回来方案一：更改gcc和gcc+的版本没有合适的版本的话需要root权限指定
戴尔笔记本win8系统改装win7系统 sophia天雪 win7 戴尔改装系统 win8
戴尔win8 系统改装win7 系统详述第一步：使用U盘制作虚拟光驱： 1）下载安装UltraISO：注册码可以在网上搜索。 2）启动UltraISO，点击“文件”—》“打开”按钮，打开已经准备好的ISO镜像文
BeanUtils.copyProperties使用笔记 bylijinnan java
BeanUtils.copyProperties VS PropertyUtils.copyProperties 两者最大的区别是： BeanUtils.copyProperties会进行类型转换，而PropertyUtils.copyProperties不会。既然进行了类型转换，那BeanUtils.copyProperties的速度比不上PropertyUtils.copyProp
MyEclipse中文乱码问题 0624chenhong MyEclipse
一、设置新建常见文件的默认编码格式，也就是文件保存的格式。在不对MyEclipse进行设置的时候，默认保存文件的编码，一般跟简体中文操作系统（如windows2000，windowsXP）的编码一致，即GBK。在简体中文系统下，ANSI 编码代表 GBK编码;在日文操作系统下，ANSI 编码代表 JIS 编码。 Window-->Preferences-->General -
发送邮件不懂事的小屁孩 send email
import org.apache.commons.mail.EmailAttachment; import org.apache.commons.mail.EmailException; import org.apache.commons.mail.HtmlEmail; import org.apache.commons.mail.MultiPartEmail;
动画合集换个号韩国红果果 html css
动画指一种样式变为另一种样式 keyframes应当始终定义0 100 过程 1 transition 制作鼠标滑过图片时的放大效果 css .wrap{ width: 340px;height: 340px; position: absolute; top: 30%; left: 20%; overflow: hidden; bor
网络最常见的攻击方式竟然是SQL注入蓝儿唯美 sql注入
NTT研究表明，尽管SQL注入（SQLi）型攻击记录详尽且为人熟知，但目前网络应用程序仍然是SQLi攻击的重灾区。信息安全和风险管理公司NTTCom Security发布的《2015全球智能威胁风险报告》表明，目前黑客攻击网络应用程序方式中最流行的，要数SQLi攻击。报告对去年发生的60亿攻击行为进行分析，指出SQLi攻击是最常见的网络应用程序攻击方式。全球网络应用程序攻击中，SQLi攻击占
java笔记2 a-john java
类的封装： 1，java中，对象就是一个封装体。封装是把对象的属性和服务结合成一个独立的的单位。并尽可能隐藏对象的内部细节（尤其是私有数据） 2，目的：使对象以外的部分不能随意存取对象的内部数据（如属性），从而使软件错误能够局部化，减少差错和排错的难度。 3，简单来说，“隐藏属性、方法或实现细节的过程”称为——封装。 4，封装的特性： 4.1设置
[Andengine]Error：can't creat bitmap form path “gfx/xxx.xxx” aijuans 学习Android遇到的错误
最开始遇到这个错误是很早以前了，以前也没注意，只当是一个不理解的bug，因为所有的texture，textureregion都没有问题，但是就是提示错误。昨天和美工要图片，本来是要背景透明的png格式，可是她却给了我一个jpg的。说明了之后她说没法改，因为没有png这个保存选项。我就看了一下，和她要了psd的文件，还好我有一点
自己写的一个繁体到简体的转换程序 asialee java 转换繁体 filter 简体
今天调研一个任务，基于java的filter实现繁体到简体的转换，于是写了一个demo，给各位博友奉上，欢迎批评指正。实现的思路是重载request的调取参数的几个方法，然后做下转换。
android意图和意图监听器技术百合不是茶 android 显示意图隐式意图意图监听器
Intent是在activity之间传递数据;Intent的传递分为显示传递和隐式传递显式意图：调用Intent.setComponent() 或 Intent.setClassName() 或 Intent.setClass()方法明确指定了组件名的Intent为显式意图，显式意图明确指定了Intent应该传递给哪个组件。隐式意图;不指明调用的名称,根据设
spring3中新增的@value注解 bijian1013 java spring @Value
在spring 3.0中，可以通过使用@value，对一些如xxx.properties文件中的文件，进行键值对的注入，例子如下： 1.首先在applicationContext.xml中加入： <beans xmlns="http://www.springframework.
Jboss启用CXF日志 sunjing log jboss CXF
1. 在standalone.xml配置文件中添加system-properties： <system-properties> <property name="org.apache.cxf.logging.enabled" value=&
【Hadoop三】Centos7_x86_64部署Hadoop集群之编译Hadoop源代码 bit1129 centos
编译必需的软件 Firebugs3.0.0 Maven3.2.3 Ant JDK1.7.0_67 protobuf-2.5.0 Hadoop 2.5.2源码包 Firebugs3.0.0 http://sourceforge.jp/projects/sfnet_findbug
struts2验证框架的使用和扩展白糖_ 框架 xml bean struts 正则表达式
struts2能够对前台提交的表单数据进行输入有效性校验，通常有两种方式： 1、在Action类中通过validatexx方法验证，这种方式很简单，在此不再赘述； 2、通过编写xx-validation.xml文件执行表单验证，当用户提交表单请求后，struts会优先执行xml文件，如果校验不通过是不会让请求访问指定action的。本文介绍一下struts2通过xml文件进行校验的方法并说
记录-感悟 braveCS 感悟
再翻翻以前写的感悟，有时会发现自己很幼稚，也会让自己找回初心。 2015-1-11 1. 能在工作之余学习感兴趣的东西已经很幸福了； 2. 要改变自己，不能这样一直在原来区域，要突破安全区舒适区，才能提高自己，往好的方面发展； 3. 多反省多思考；要会用工具，而不是变成工具的奴隶； 4. 一天内集中一个定长时间段看最新资讯和偏流式博
编程之美-数组中最长递增子序列 bylijinnan 编程之美
import java.util.Arrays; import java.util.Random; public class LongestAccendingSubSequence { /** * 编程之美数组中最长递增子序列 * 书上的解法容易理解 * 另一方法书上没有提到的是，可以将数组排序（由小到大）得到新的数组， * 然后求排序后的数组与原数
读书笔记5 chengxuyuancsdn 重复提交 struts2的token验证
1、重复提交 2、struts2的token验证 3、用response返回xml时的注意 1、重复提交 (1)应用场景 (1-1)点击提交按钮两次。 (1-2)使用浏览器后退按钮重复之前的操作，导致重复提交表单。 (1-3)刷新页面 (1-4)使用浏览器历史记录重复提交表单。 (1-5)浏览器重复的 HTTP 请求。 (2)解决方法 (2-1)禁掉提交按钮 (2-2)
[时空与探索]全球联合进行第二次费城实验的可能性 comsci
二次世界大战前后,由爱因斯坦参加的一次在海军舰艇上进行的物理学实验 -费城实验至今给我们大家留下很多迷团..... 关于费城实验的详细过程,大家可以在网络上搜索一下,我这里就不详细描述了在这里,我的意思是,现在
easy connect 之 ORA-12154: TNS: 无法解析指定的连接标识符 daizj oracle ORA-12154
用easy connect连接出现“tns无法解析指定的连接标示符”的错误，如下： C:\Users\Administrator>sqlplus username/[email protected]:1521/orcl SQL*Plus: Release 10.2.0.1.0 – Production on 星期一 5月 21 18:16:20 2012 Copyright (c) 198
简单排序:归并排序 dieslrae 归并排序
public void mergeSort(int[] array){ int temp = array.length/2; if(temp == 0){ return; } int[] a = new int[temp]; int
C语言中字符串的\0和空格 dcj3sjt126com c
\0 为字符串结束符，比如说： abcd (空格)cdefg；存入数组时，空格作为一个字符占有一个字节的空间，我们
解决Composer国内速度慢的办法 dcj3sjt126com Composer
用法：有两种方式启用本镜像服务： 1 将以下配置信息添加到 Composer 的配置文件 config.json 中（系统全局配置）。见“例1” 2 将以下配置信息添加到你的项目的 composer.json 文件中（针对单个项目配置）。见“例2” 为了避免安装包的时候都要执行两次查询，切记要添加禁用 packagist 的设置，如下 1 2 3 4 5
高效可伸缩的结果缓存 shuizhaosi888 高效可伸缩的结果缓存
/** * 要执行的算法，返回结果v */ public interface Computable<A, V> { public V comput(final A arg); } /** * 用于缓存数据 */ public class Memoizer<A, V> implements Computable<A,
三点定位的算法 haoningabc c 算法
三点定位，已知a,b,c三个顶点的x,y坐标和三个点都z坐标的距离，la，lb,lc 求z点的坐标原理就是围绕a,b,c 三个点画圆，三个圆焦点的部分就是所求但是，由于三个点的距离可能不准，不一定会有结果，所以是三个圆环的焦点，环的宽度开始为0，没有取到则加1 运行 gcc -lm test.c test.c代码如下 #include "stdi
epoll使用详解 jimmee c linux 服务端编程 epoll
epoll - I/O event notification facility在linux的网络编程中，很长的时间都在使用select来做事件触发。在linux新的内核中，有了一种替换它的机制，就是epoll。相比于select，epoll最大的好处在于它不会随着监听fd数目的增长而降低效率。因为在内核中的select实现中，它是采用轮询来处理的，轮询的fd数目越多，自然耗时越多。并且，在linu
Hibernate对Enum的映射的基本使用方法 linzx0212 enum Hibernate
枚举 /** * 性别枚举 */ public enum Gender { MALE(0), FEMALE(1), OTHER(2); private Gender(int i) { this.i = i; } private int i; public int getI
第10章高级事件（下） onestopweb 事件
index.html <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/
孙子兵法 roadrunners 孙子兵法
始计第一孙子曰：兵者，国之大事，死生之地，存亡之道，不可不察也。故经之以五事，校之以计，而索其情：一曰道，二曰天，三曰地，四曰将，五曰法。道者，令民于上同意，可与之死，可与之生，而不危也；天者，阴阳、寒暑、时制也；地者，远近、险易、广狭、死生也；将者，智、信、仁、勇、严也；法者，曲制、官道、主用也。凡此五者，将莫不闻，知之者胜，不知之者不胜。故校之以计，而索其情，曰
MySQL双向复制 tomcat_oracle mysql
本文包括: 主机配置从机配置建立主-从复制建立双向复制背景按照以下简单的步骤: 参考一下：在机器A配置主机(192.168.1.30) 在机器B配置从机(192.168.1.29) 我们可以使用下面的步骤来实现这一点步骤1：机器A设置主机在主机中打开配置文件 ,
zoj 3822 Domination(dp) 阿尔萨斯 Mina
题目链接：zoj 3822 Domination 题目大意：给定一个N∗M的棋盘，每次任选一个位置放置一枚棋子，直到每行每列上都至少有一枚棋子，问放置棋子个数的期望。解题思路：大白书上概率那一张有一道类似的题目，但是因为时间比较久了，还是稍微想了一下。dp[i][j][k]表示i行j列上均有至少一枚棋子，并且消耗k步的概率（k≤i∗j）,因为放置在i+1~n上等价与放在i+1行上，同理

Transformer架构完整代码示例

Transformer架构完整代码

参考链接：

你可能感兴趣的:(计算机视觉,人工智能,transformer,深度学习,人工智能,pytorch,源代码管理)