Cumulative Attention Large Model

Unstable version

import paddle


class HeadLoss(paddle.nn.Layer):
    def __init__(self):
        super(HeadLoss, self).__init__()
        self.loss = paddle.nn.CrossEntropyLoss()

    def forward(self, x_list, label):
        # x_list holds one logits tensor per base-h digit head; label holds the
        # original token ids, which are split into digits below.
        loss = 0
        h = x_list[0].shape[-1]
        p = len(x_list) + 1
        acc_data = 1
        for i, out in enumerate(x_list, start=1):
            # i-th base-h digit of the label, scored against the i-th head.
            one_label = (label % h ** (p - i) // h ** ((p - i) - 1)).astype("int64")
            loss += self.loss(out.reshape([-1, h]), one_label.reshape([-1]))
            with paddle.no_grad():
                # A token only counts as correct if every digit head is correct.
                acc_data *= (paddle.argmax(out, -1) == one_label).numpy()

        return loss, acc_data.mean()


class HiddenHead(paddle.nn.Layer):
    def __init__(self, voc_size=19, hidden_size=512):
        super(HiddenHead, self).__init__()

        self.hidden_size = hidden_size
        # p + 1 = number of base-hidden_size digits needed to cover voc_size.
        p = 0
        while True:
            voc_size //= hidden_size
            if voc_size == 0:
                break
            else:
                p += 1
        self.head = paddle.nn.LayerList(
            [paddle.nn.Linear(hidden_size, hidden_size, bias_attr=False) for _ in range(p + 1)])

    def forward(self, head_add_x):
        # One linear head per digit; each returns logits over its own digit.
        x0_list = []
        for head in self.head:
            x0_list.append(head(head_add_x))
        return x0_list

    def sample(self, head_add_x):
        # Greedy decode: take the argmax of every digit head and rebuild the
        # token id from the base-h digits.
        h = head_add_x.shape[-1]
        x0_res = 0

        p = len(self.head) + 1

        for i, head in enumerate(self.head, start=1):
            arg_max = head(head_add_x)
            arg_max = paddle.argmax(arg_max, -1)
            x0_res += arg_max * h ** ((p - i) - 1)
        return x0_res


class EmAdd(paddle.nn.Layer):
    def __init__(self, voc_size=9999, hidden_size=256):
        super(EmAdd, self).__init__()
        self.hidden_size = hidden_size
        p = 0
        while True:
            voc_size //= hidden_size
            if voc_size == 0:
                break
            else:
                p += 1
        self.em = paddle.nn.LayerList([paddle.nn.Embedding(hidden_size, hidden_size) for _ in range(p + 1)])
        # self.em_zero = paddle.nn.LayerList([paddle.nn.Embedding(hidden_size, hidden_size,padding_idx=0)
        # for _ in range(p + 1)])

    def forward(self, em_add_x):

        add = 0
        p = len(self.em) + 1
        mask = paddle.zeros(em_add_x.shape)
        for i, em in enumerate(self.em, start=1):
            # mask: 1 once any nonzero digit has appeared, so embeddings of
            # leading zero digits are suppressed.
            x0 = em_add_x % self.hidden_size ** (p - i) // self.hidden_size ** ((p - i) - 1)
            mask += x0
            mask = mask != 0
            mask = mask.astype("int")
            x0 = em(x0) * mask.unsqueeze(-1)
            add = paddle.sin(x0 + add)

        return add
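
The three classes above share one idea: a token id in a vocabulary of size voc_size is treated as a (p + 1)-digit number in base hidden_size, and every digit gets its own embedding (EmAdd), its own output head (HiddenHead) and its own loss term (HeadLoss). A minimal sketch of that decomposition and its inverse, with hypothetical helper names that are not part of the original script:

def digit_decompose(token_id, h, num_digits):
    # Most-significant digit first; this mirrors the expression
    # token_id % h ** (p - i) // h ** ((p - i) - 1) used in HeadLoss and EmAdd.
    p = num_digits
    return [token_id % h ** (p - i) // h ** (p - i - 1) for i in range(p)]


def digit_compose(digits, h):
    # Inverse mapping, the same reconstruction HiddenHead.sample performs
    # by accumulating arg_max * h ** ((p - i) - 1).
    token_id = 0
    for d in digits:
        token_id = token_id * h + d
    return token_id


# Example: voc_size 12935 with hidden_size 128 needs two digits (128 ** 2 > 12935).
assert digit_decompose(12934, 128, 2) == [101, 6]   # 101 * 128 + 6 == 12934
assert digit_compose([101, 6], 128) == 12934
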


class ReNet(paddle.nn.Layer):
    def __init__(self, voc_size, hidden_dim, num_head):
        super(ReNet, self).__init__()
        # Instantiate once so the norm's parameters persist across forward calls.
        self.group_norm = paddle.nn.GroupNorm(num_head, num_head)
        self.em = EmAdd(voc_size, hidden_dim * num_head)
        self.head = num_head

    def forward(self, x):
        x = self.em(x)
        seq_len = x.shape[1]
        bsz = x.shape[0]
        x = x.reshape([bsz, seq_len, self.head, -1]).transpose([0, 2, 1, 3])

        q = paddle.sin(x)
        k = paddle.sin(q + x)
        v = paddle.sin(k + x)
        attention = self.ParallelRetention(seq_len, q, k, v)
        return attention

    def sample_forward(self, x):
        x = self.em(x)
        seq_len = x.shape[1]
        bsz = x.shape[0]
        x = x.reshape([bsz, seq_len, self.head, -1]).transpose([0, 2, 1, 3])

        q = paddle.sin(x)
        k = paddle.sin(q + x)
        v = paddle.sin(k + x)
        # NOTE: RecurrentRetention is not defined anywhere in this unstable version;
        # a generic per-step sketch is given after this class for orientation.
        attention, state = self.RecurrentRetention(q, k, v, paddle.zeros(q.shape), "")
        return x

    def ParallelRetention(self, seq_len,
                          q,  # bsz*num_head*len*qk_dim
                          k,  # bsz*num_head*len*qk_dim
                          v,  # bsz*num_head*len*v_dim
                          ):
        # Lower-triangular causal mask (len*len): position i only sees j <= i.
        decay_mask = paddle.triu(paddle.ones([seq_len, seq_len])).T
        retention = q @ k.transpose([0, 1, 3, 2])
        retention = retention * decay_mask
        output = retention @ v

        # Recurrent-style form; note this overwrites the masked result above.
        current_kv = k.unsqueeze(-1) * v.unsqueeze(-2)
        output = paddle.sum(q.unsqueeze(-1) * current_kv, -2)
        output = self.group_norm(output)
        return output
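
RecurrentRetention is called in sample_forward above but never defined in this unstable version. For orientation only, here is a minimal per-step sketch of the standard recurrent retention update (the decay-free form that matches the all-ones causal mask used in ParallelRetention). It is a generic sketch, not the author's missing implementation, and its signature differs from the call site above:

import paddle

def recurrent_retention_step(q_n, k_n, v_n, state):
    # q_n, k_n: [bsz, num_head, qk_dim]; v_n: [bsz, num_head, v_dim]
    # state:    [bsz, num_head, qk_dim, v_dim], running sum of k ⊗ v outer products.
    state = state + k_n.unsqueeze(-1) * v_n.unsqueeze(-2)   # accumulate k_n ⊗ v_n
    out = paddle.sum(q_n.unsqueeze(-1) * state, axis=-2)    # contract with q_n -> [bsz, num_head, v_dim]
    return out, state
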
def demo():
    import numpy as np
    # Define the matrices kup, vup, knext, vnext
    kup = np.array([[5, 2],
                    [3, 4]])

    vup = np.array([[5, 6],
                    [7, 8]])

    knext = np.array([[9, 10],
                      [11, 12]])

    vnext = np.array([[13, 14],
                      [15, 16]])

    # Left side: kup @ (vup + vnext) + knext @ (vup + vnext)
    left = np.dot(kup, vup + vnext) + np.dot(knext, vup + vnext)

    # Right side: (kup + knext) @ (vup + vnext)
    right = np.dot(kup + knext, vup + vnext)

    # Matrix multiplication distributes over addition, so both sides must match.
    if np.array_equal(left, right):
        print("equal: kup @ (vup + vnext) + knext @ (vup + vnext) == (kup + knext) @ (vup + vnext)")
    else:
        print("not equal")
        
if __name__ == '__main__':
    net = ReNet(12935, 128, 8)
    # net(paddle.randint(1, 123, [3, 23]))
    # net.sample_forward(paddle.randint(1, 123, [3, 23]))
    s = 10
    h = 12
    a = paddle.randint(1, 123, [s, h]).astype("float32")
    b = paddle.randint(1, 123, [s, h]).astype("float32")
    c = paddle.randint(1, 123, [s, h]).astype("float32")
    print()




#  Element-wise expansion of (Q @ K^T) @ V for
#  Q = [[A1, A2, A3], [B1, B2, B3]], K^T = [[A4, B4], [A5, B5], [A6, B6]],
#  V = [[A7, A8], [B7, B8]].
#  First, Q @ K^T:
#  A1*A4+A2*A5+A3*A6  ,A1*B4+A2*B5+A3*B6
#  B1*A4+B2*A5+B3*A6  ,B1*B4+B2*B5+B3*B6

#  Then (Q @ K^T) @ V:
#  A7*(A1*A4+A2*A5+A3*A6) + B7*(A1*B4+A2*B5+A3*B6),  A8*(A1*A4+A2*A5+A3*A6) + B8*(A1*B4+A2*B5+A3*B6)
#  A7*(B1*A4+B2*A5+B3*A6) + B7*(B1*B4+B2*B5+B3*B6),  A8*(B1*A4+B2*A5+B3*A6) + B8*(B1*B4+B2*B5+B3*B6)
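
A quick numpy check (my addition, not part of the original script) that the hand expansion above is exactly (Q @ K^T) @ V, and that, by associativity, the same result equals Q @ (K^T @ V), which is the reordering that retention-style attention relies on:

import numpy as np

Q = np.array([[1, 2, 3],
              [4, 5, 6]])            # rows: (A1 A2 A3), (B1 B2 B3)
Kt = np.array([[7, 8],
               [9, 10],
               [11, 12]])            # rows: (A4 B4), (A5 B5), (A6 B6)
V = np.array([[13, 14],
              [15, 16]])             # rows: (A7 A8), (B7 B8)

(A1, A2, A3), (B1, B2, B3) = Q
(A4, B4), (A5, B5), (A6, B6) = Kt
(A7, A8), (B7, B8) = V

# The two rows written out in the comment block above.
hand = np.array([
    [A7 * (A1 * A4 + A2 * A5 + A3 * A6) + B7 * (A1 * B4 + A2 * B5 + A3 * B6),
     A8 * (A1 * A4 + A2 * A5 + A3 * A6) + B8 * (A1 * B4 + A2 * B5 + A3 * B6)],
    [A7 * (B1 * A4 + B2 * A5 + B3 * A6) + B7 * (B1 * B4 + B2 * B5 + B3 * B6),
     A8 * (B1 * A4 + B2 * A5 + B3 * A6) + B8 * (B1 * B4 + B2 * B5 + B3 * B6)],
])

assert np.array_equal(hand, (Q @ Kt) @ V)
assert np.array_equal(hand, Q @ (Kt @ V))
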


Stable version

import paddle
import numpy as np


class HeadLoss(paddle.nn.Layer):
    def __init__(self):
        super(HeadLoss, self).__init__()
        self.loss = paddle.nn.CrossEntropyLoss()

    def forward(self, x_list, label):
        loss = 0
        h = x_list[0].shape[-1]
        p = len(x_list) + 1
        acc_data = 1
        for i, out in enumerate(x_list):
            i += 1
            one_label = (label % h ** (p - i) // h ** ((p - i) - 1)).astype("int64")
            loss += self.loss(out.reshape([-1, h]), one_label.reshape([-1]))
            with paddle.no_grad():
                acc_data *= (paddle.argmax(out, -1) == one_label).numpy()

        return loss, acc_data.mean()


class HiddenHead(paddle.nn.Layer):
    def __init__(self, voc_size=19, hidden_size=512):
        super(HiddenHead, self).__init__()

        self.hidden_size = hidden_size
        p = 0
        while True:
            voc_size //= hidden_size
            if voc_size == 0:
                break
            else:
                p += 1
        self.head = paddle.nn.LayerList(
            [paddle.nn.Linear(hidden_size, hidden_size, bias_attr=False) for _ in range(p + 1)])

    def forward(self, head_add_x):
        x0_list = []
        for i, head in enumerate(self.head):
            i += 1
            x0_list.append(head(head_add_x))
        return x0_list

    def sample(self, head_add_x):
        h = head_add_x.shape[-1]
        x0_res = 0

        p = len(self.head) + 1

        for i, head in enumerate(self.head):
            i += 1
            arg_max = head(head_add_x)
            arg_max = paddle.argmax(arg_max, -1)
            x0_res += arg_max * h ** ((p - i) - 1)
        return x0_res


class EmAdd(paddle.nn.Layer):
    def __init__(self, voc_size=9999, hidden_size=256):
        super(EmAdd, self).__init__()
        self.hidden_size = hidden_size
        p = 0
        while True:
            voc_size //= hidden_size
            if voc_size == 0:
                break
            else:
                p += 1
        self.em = paddle.nn.LayerList([paddle.nn.Embedding(hidden_size, hidden_size) for _ in range(p + 1)])
        # self.em_zero = paddle.nn.LayerList([paddle.nn.Embedding(hidden_size, hidden_size,padding_idx=0)
        # for _ in range(p + 1)])

    def forward(self, em_add_x):

        add = 0
        p = len(self.em) + 1
        mask = paddle.zeros(em_add_x.shape)
        for i, em in enumerate(self.em):
            # mask: 1 once any nonzero digit has appeared, so embeddings of
            # leading zero digits are suppressed.
            i += 1
            x0 = em_add_x % self.hidden_size ** (p - i) // self.hidden_size ** ((p - i) - 1)
            mask += x0
            mask = mask != 0
            mask = mask.astype("int")
            x0 = em(x0) * mask.unsqueeze(-1)
            add = paddle.sin(x0 + add)

        return add


class ReNet(paddle.nn.Layer):
    def __init__(self, voc_size, hidden_dim, num_head_dim, n_layers):
        super(ReNet, self).__init__()
        self.em = EmAdd(voc_size, hidden_dim * num_head_dim)
        self.qk_list = paddle.nn.LayerList(
            [paddle.nn.Linear(num_head_dim, hidden_dim, bias_attr=False) for _ in range(n_layers)])

        self.head = num_head_dim
        self.out_layer = HiddenHead(voc_size, hidden_dim)


    def forward(self, x, state):
        x = self.em(x)
        q = paddle.sin(x)
        k = paddle.sin(x + q)
        seq_len = q.shape[1]
        bsz = q.shape[0]
        q = q.reshape([bsz, seq_len, self.head, -1]).transpose([0, 2, 1, 3])
        k = k.reshape([bsz, seq_len, self.head, -1]).transpose([0, 2, 1, 3])
        mask = paddle.triu(paddle.ones([seq_len, seq_len])).T

        # Causal cumulative term: sin-activated masked q·k scores, summed over key
        # positions; the last position is kept as `state` for continued decoding.
        qk = state + paddle.sum(paddle.sin(q @ k.transpose([0, 1, 3, 2]) * mask), -1)
        state = qk[:, :, -1:]
        new_qk = 0
        for one_k in self.qk_list:
            new_qk += one_k(qk.transpose([0, 2, 1]))
            new_qk = paddle.sin(new_qk)

        out = self.out_layer(new_qk)
        return out, state

    def sample(self, x, state):
        x = self.em(x)
        q = paddle.sin(x)
        k = paddle.sin(x + q)
        seq_len = q.shape[1]
        bsz = q.shape[0]
        q = q.reshape([bsz, seq_len, self.head, -1]).transpose([0, 2, 1, 3])
        k = k.reshape([bsz, seq_len, self.head, -1]).transpose([0, 2, 1, 3])
        mask = paddle.triu(paddle.ones([seq_len, seq_len])).T

        qk = state + paddle.sum(paddle.sin(q @ k.transpose([0, 1, 3, 2]) * mask), -1)
        state = qk[:, :, -1:]
        new_qk = 0
        for one_k in self.qk_list:
            new_qk += one_k(qk.transpose([0, 2, 1]))
            new_qk = paddle.sin(new_qk)

        out = self.out_layer.sample(new_qk)
        return out, state
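
The heart of the stable forward/sample is qk = state + paddle.sum(paddle.sin(q @ k^T * mask), -1). Because sin(0) == 0, masking before the sin and summing the whole row is the same as a causal prefix sum of sin(q_i · k_j) over j <= i, which is presumably the "cumulative attention" the title refers to. A small numpy check of that equivalence (my addition, not part of the original script):

import numpy as np

L = 6
scores = np.random.randn(L, L)                  # stand-in for q @ k^T of one head
mask = np.tril(np.ones((L, L)))                 # same as paddle.triu(ones).T above
masked_sum = np.sin(scores * mask).sum(-1)      # what the model computes per row
prefix_sum = np.array([np.sin(scores[i, : i + 1]).sum() for i in range(L)])
assert np.allclose(masked_sum, prefix_sum)
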



def emheading_train_and_sample():
    print("*" * 100)
    net = ReNet(12935, 80, 80, 8)
    # net.eval()
    x = paddle.to_tensor([
        np.random.randint(1, 124, 100),
        np.random.randint(1, 124, 100),
    ], dtype='int64')

    xx = x

    # Simulated training loop

    loss_f = HeadLoss()
    opt = paddle.optimizer.Adam(parameters=net.parameters(), learning_rate=0.0003)

    for i in range(80):
        out, state = net(x[:, :-1], 0)
        loss, av_ac = loss_f(out, x[:, 1:])

        print(i, loss.item(), av_ac)

        opt.clear_grad()
        loss.backward()
        opt.step()

    # Decode and verify the trained model
    net.eval()

    out, _ = net.sample(xx[:, :-1], 0)
    print((out == xx[:, 1:]).numpy().mean())
    out, _ = net.sample(xx[:, :-2], 0)
    print((out[:, -1:] == xx[:, -2:-1]).numpy().mean())
    out, _ = net.sample(xx[:, :-1], 0)
    print((out[:, -1:] == xx[:, -1:]).numpy().mean())
    for i in range(50):
        print("long-range dependency check")
        out, _ = net.sample(xx[:, :i + 30], 0)
        print((out[:, -1:] == xx[:, i + 30:i + 31]).numpy().mean())

    out, _ = net(xx[:, :-1], 0)
    loss, av_ac = loss_f(out, xx[:, 1:])
    print(loss.item(), av_ac)


# Run model training and prediction
if __name__ == '__main__':
    emheading_train_and_sample()

# if __name__ == '__main__':
#     # demo()
#     net = ReNet(12935, 12, 8,8)
#     net(paddle.randint(1, 123, [3, 23]), 0)
