After studying the FastSpeech paper, I went through a reimplementation repository on GitHub to better understand the details of the algorithm. The chosen repository is a PyTorch implementation: https://github.com/xcmyz/FastSpeech. It uses the LJSpeech dataset; the data-processing code is covered in the note "FastSpeech reimplementation GitHub project: data preparation". This note adds detailed comments to the model-building code. The files related to model construction are those under the transformer directory plus modules.py and model.py; I hope the annotations below help readers understand FastSpeech.
Because the FFT block in FastSpeech and the alignment extraction used to train the duration predictor are both Transformer-related, the project keeps the corresponding files under the transformer directory.
This file (transformer/Constants.py) defines a few constants that may be used during data processing.
PAD = 0
UNK = 1
BOS = 2
EOS = 3
PAD_WORD = '<blank>'
UNK_WORD = '<unk>'
BOS_WORD = '<s>'
EOS_WORD = '</s>'
This file (transformer/Modules.py) implements the scaled dot-product module used to compute attention.
import torch
import torch.nn as nn
import numpy as np
# Scaled dot-product attention module
class ScaledDotProductAttention(nn.Module):
''' Scaled Dot-Product Attention '''
def __init__(self, temperature, attn_dropout=0.1):
super().__init__()
self.temperature = temperature
self.dropout = nn.Dropout(attn_dropout)
self.softmax = nn.Softmax(dim=2)
def forward(self, q, k, v, mask=None):
# batch matrix multiplication of q and the transposed k
attn = torch.bmm(q, k.transpose(1, 2))
attn = attn / self.temperature # scale by the temperature (sqrt of d_k)
if mask is not None:
attn = attn.masked_fill(mask, -np.inf)
attn = self.softmax(attn)
attn = self.dropout(attn)
output = torch.bmm(attn, v)
return output, attn
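To make the tensor shapes concrete, here is a small shape-check sketch of my own (not part of the repo), assuming the repository root is on PYTHONPATH so transformer.Modules can be imported:
import torch
from transformer.Modules import ScaledDotProductAttention
# toy shapes: batch 2, query length 4, key/value length 6, d_k = d_v = 8
q = torch.randn(2, 4, 8)
k = torch.randn(2, 6, 8)
v = torch.randn(2, 6, 8)
mask = torch.zeros(2, 4, 6, dtype=torch.bool)  # True marks positions to mask out
attention = ScaledDotProductAttention(temperature=8 ** 0.5)
output, attn = attention(q, k, v, mask=mask)
print(output.shape)  # torch.Size([2, 4, 8])
print(attn.shape)    # torch.Size([2, 4, 6])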
This file (transformer/SubLayers.py) defines the multi-head attention layer and the position-wise feed-forward network (FFN) used inside the FFT block.
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from transformer.Modules import ScaledDotProductAttention
import hparams as hp
# Multi-head attention module
class MultiHeadAttention(nn.Module):
''' Multi-Head Attention module '''
def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
super().__init__()
self.n_head = n_head
self.d_k = d_k
self.d_v = d_v
# linear projections applied to q, k and v before the attention is computed
self.w_qs = nn.Linear(d_model, n_head * d_k)
self.w_ks = nn.Linear(d_model, n_head * d_k)
self.w_vs = nn.Linear(d_model, n_head * d_v)
# initialize the projection layers above
nn.init.normal_(self.w_qs.weight, mean=0,
std=np.sqrt(2.0 / (d_model + d_k)))
nn.init.normal_(self.w_ks.weight, mean=0,
std=np.sqrt(2.0 / (d_model + d_k)))
nn.init.normal_(self.w_vs.weight, mean=0,
std=np.sqrt(2.0 / (d_model + d_v)))
self.attention = ScaledDotProductAttention(
temperature=np.power(d_k, 0.5))
self.layer_norm = nn.LayerNorm(d_model)
self.fc = nn.Linear(n_head * d_v, d_model)
nn.init.xavier_normal_(self.fc.weight)
self.dropout = nn.Dropout(dropout)
def forward(self, q, k, v, mask=None):
d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
sz_b, len_q, _ = q.size()
sz_b, len_k, _ = k.size()
sz_b, len_v, _ = v.size()
residual = q # kept to add back to the output at the end (residual connection)
# project q, k, v and reshape them to 4-D, i.e. split them into multiple heads
q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)
# merge the batch_size and n_head dimensions of q, k, v, reducing them back to 3-D
q = q.permute(2, 0, 1, 3).contiguous().view(-1,
len_q, d_k) # (n*b) x lq x dk
k = k.permute(2, 0, 1, 3).contiguous().view(-1,
len_k, d_k) # (n*b) x lk x dk
v = v.permute(2, 0, 1, 3).contiguous().view(-1,
len_v, d_v) # (n*b) x lv x dv
# replicate the same mask for every head
mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x ..
output, attn = self.attention(q, k, v, mask=mask) # attention output and the attention matrix
output = output.view(n_head, sz_b, len_q, d_v)
output = output.permute(1, 2, 0, 3).contiguous().view(
sz_b, len_q, -1) # b x lq x (n*dv) # restore the 4-D layout, then merge the heads back into the feature dimension, reducing to 3-D
output = self.dropout(self.fc(output))
output = self.layer_norm(output + residual) # residual
return output, attn
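A quick usage sketch of MultiHeadAttention (my own example, not from the repo): the mask is expected with shape [batch, len_q, len_k] and is repeated once per head inside the module. Again this assumes the repository root is on PYTHONPATH:
import torch
from transformer.SubLayers import MultiHeadAttention
# toy configuration: d_model = 16, 2 heads, d_k = d_v = 8
mha = MultiHeadAttention(n_head=2, d_model=16, d_k=8, d_v=8)
x = torch.randn(3, 5, 16)                      # [batch, seq_len, d_model]
mask = torch.zeros(3, 5, 5, dtype=torch.bool)  # [batch, len_q, len_k], True = masked
out, attn = mha(x, x, x, mask=mask)
print(out.shape)   # torch.Size([3, 5, 16])
print(attn.shape)  # torch.Size([6, 5, 5]), i.e. (n_head * batch, len_q, len_k)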
# Position-wise feed-forward network (FFN) of the FFT block; it uses 1-D convolutions internally and also adds a residual connection
class PositionwiseFeedForward(nn.Module):
''' A two-feed-forward-layer module '''
def __init__(self, d_in, d_hid, dropout=0.1):
super().__init__()
# Use Conv1D
# position-wise
self.w_1 = nn.Conv1d(
d_in, d_hid, kernel_size=hp.fft_conv1d_kernel[0], padding=hp.fft_conv1d_padding[0])
# position-wise
self.w_2 = nn.Conv1d(
d_hid, d_in, kernel_size=hp.fft_conv1d_kernel[1], padding=hp.fft_conv1d_padding[1])
self.layer_norm = nn.LayerNorm(d_in)
self.dropout = nn.Dropout(dropout)
def forward(self, x):
residual = x
output = x.transpose(1, 2)
output = self.w_2(F.relu(self.w_1(output)))
output = output.transpose(1, 2)
output = self.dropout(output)
output = self.layer_norm(output + residual)
return output
This file (transformer/Layers.py) defines the FFT block and several simple modules that may be used later (some of them end up unused); their structure is similar to Tacotron and Transformer-TTS.
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
from collections import OrderedDict
from transformer.SubLayers import MultiHeadAttention, PositionwiseFeedForward
from text.symbols import symbols
# Custom linear (fully connected) layer
class Linear(nn.Module):
"""
Linear Module
"""
def __init__(self, in_dim, out_dim, bias=True, w_init='linear'):
"""
:param in_dim: dimension of input
:param out_dim: dimension of output
:param bias: boolean. if True, bias is included.
:param w_init: str. weight inits with xavier initialization.
"""
super(Linear, self).__init__()
self.linear_layer = nn.Linear(in_dim, out_dim, bias=bias)
nn.init.xavier_uniform_(
self.linear_layer.weight,
gain=nn.init.calculate_gain(w_init))
def forward(self, x):
return self.linear_layer(x)
# Pre-net: two fully connected layers with ReLU activation and dropout
class PreNet(nn.Module):
"""
Pre Net before passing through the network
"""
def __init__(self, input_size, hidden_size, output_size, p=0.5):
"""
:param input_size: dimension of input
:param hidden_size: dimension of hidden unit
:param output_size: dimension of output
"""
super(PreNet, self).__init__()
self.input_size = input_size
self.output_size = output_size
self.hidden_size = hidden_size
self.layer = nn.Sequential(OrderedDict([
('fc1', Linear(self.input_size, self.hidden_size)),
('relu1', nn.ReLU()),
('dropout1', nn.Dropout(p)),
('fc2', Linear(self.hidden_size, self.output_size)),
('relu2', nn.ReLU()),
('dropout2', nn.Dropout(p)),
]))
def forward(self, input_):
out = self.layer(input_)
return out
# Custom convolution layer wrapping a 1-D convolution
class Conv(nn.Module):
"""
Convolution Module
"""
def __init__(self,
in_channels,
out_channels,
kernel_size=1,
stride=1,
padding=0,
dilation=1,
bias=True,
w_init='linear'):
"""
:param in_channels: dimension of input
:param out_channels: dimension of output
:param kernel_size: size of kernel
:param stride: size of stride
:param padding: size of padding
:param dilation: dilation rate
:param bias: boolean. if True, bias is included.
:param w_init: str. weight inits with xavier initialization.
"""
super(Conv, self).__init__()
self.conv = nn.Conv1d(in_channels,
out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias)
nn.init.xavier_uniform_(
self.conv.weight, gain=nn.init.calculate_gain(w_init))
def forward(self, x):
x = self.conv(x)
return x
# FFT block
class FFTBlock(torch.nn.Module):
"""FFT Block"""
def __init__(self,
d_model,
d_inner,
n_head,
d_k,
d_v,
dropout=0.1):
super(FFTBlock, self).__init__()
self.slf_attn = MultiHeadAttention(
n_head, d_model, d_k, d_v, dropout=dropout)
self.pos_ffn = PositionwiseFeedForward(
d_model, d_inner, dropout=dropout)
def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
enc_output, enc_slf_attn = self.slf_attn(
enc_input, enc_input, enc_input, mask=slf_attn_mask)
enc_output *= non_pad_mask # keep only the non-pad positions
enc_output = self.pos_ffn(enc_output)
enc_output *= non_pad_mask
return enc_output, enc_slf_attn
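A shape sketch for one FFT block (my own test, not part of the repo). It assumes the repository root is on PYTHONPATH, because the FFN inside reads hp.fft_conv1d_kernel and hp.fft_conv1d_padding from the repo's hparams.py; with the repo's length-preserving kernel/padding settings the sequence shape stays unchanged:
import torch
from transformer.Layers import FFTBlock
block = FFTBlock(d_model=16, d_inner=64, n_head=2, d_k=8, d_v=8)
x = torch.randn(2, 7, 16)                               # [batch, seq_len, d_model]
non_pad_mask = torch.ones(2, 7, 1)                      # 1 = real token, 0 = pad
slf_attn_mask = torch.zeros(2, 7, 7, dtype=torch.bool)  # nothing masked in this toy case
out, attn = block(x, non_pad_mask=non_pad_mask, slf_attn_mask=slf_attn_mask)
print(out.shape)  # torch.Size([2, 7, 16]); the FFT block keeps the sequence shape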
# 1-D convolution that computes its padding automatically
class ConvNorm(torch.nn.Module):
def __init__(self,
in_channels,
out_channels,
kernel_size=1,
stride=1,
padding=None,
dilation=1,
bias=True,
w_init_gain='linear'):
super(ConvNorm, self).__init__()
if padding is None:
assert(kernel_size % 2 == 1)
padding = int(dilation * (kernel_size - 1) / 2)
self.conv = torch.nn.Conv1d(in_channels,
out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias)
torch.nn.init.xavier_uniform_(
self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
def forward(self, signal):
conv_signal = self.conv(signal)
return conv_signal
# Post-net: a stack of five 1-D convolutions with 512 channels and kernel size 5, consistent with Tacotron 2 and Transformer-TTS
class PostNet(nn.Module):
"""
PostNet: Five 1-d convolution with 512 channels and kernel size 5
"""
def __init__(self,
n_mel_channels=80,
postnet_embedding_dim=512,
postnet_kernel_size=5,
postnet_n_convolutions=5):
super(PostNet, self).__init__()
self.convolutions = nn.ModuleList()
self.convolutions.append(
nn.Sequential(
ConvNorm(n_mel_channels,
postnet_embedding_dim,
kernel_size=postnet_kernel_size,
stride=1,
padding=int((postnet_kernel_size - 1) / 2),
dilation=1,
w_init_gain='tanh'),
nn.BatchNorm1d(postnet_embedding_dim))
)
for i in range(1, postnet_n_convolutions - 1):
self.convolutions.append(
nn.Sequential(
ConvNorm(postnet_embedding_dim,
postnet_embedding_dim,
kernel_size=postnet_kernel_size,
stride=1,
padding=int((postnet_kernel_size - 1) / 2),
dilation=1,
w_init_gain='tanh'),
nn.BatchNorm1d(postnet_embedding_dim))
)
self.convolutions.append(
nn.Sequential(
ConvNorm(postnet_embedding_dim,
n_mel_channels,
kernel_size=postnet_kernel_size,
stride=1,
padding=int((postnet_kernel_size - 1) / 2),
dilation=1,
w_init_gain='linear'),
nn.BatchNorm1d(n_mel_channels))
)
def forward(self, x):
x = x.contiguous().transpose(1, 2)
for i in range(len(self.convolutions) - 1):
x = F.dropout(torch.tanh(
self.convolutions[i](x)), 0.5, self.training)
x = F.dropout(self.convolutions[-1](x), 0.5, self.training)
x = x.contiguous().transpose(1, 2)
return x
This file (transformer/Models.py) implements the FastSpeech encoder and decoder.
import torch
import torch.nn as nn
import numpy as np
import hparams as hp
import transformer.Constants as Constants
from transformer.Layers import FFTBlock, PreNet, PostNet, Linear
# get the mask marking the non-pad elements of a sequence
def get_non_pad_mask(seq):
assert seq.dim() == 2
return seq.ne(Constants.PAD).type(torch.float).unsqueeze(-1)
# sinusoidal positional encoding
def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
''' Sinusoid position encoding table '''
def cal_angle(position, hid_idx):
return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)
def get_posi_angle_vec(position):
return [cal_angle(position, hid_j) for hid_j in range(d_hid)]
sinusoid_table = np.array([get_posi_angle_vec(pos_i)
for pos_i in range(n_position)])
sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
# zero out the positional-encoding row that corresponds to padding_idx
if padding_idx is not None:
# zero vector for padding dimension
sinusoid_table[padding_idx] = 0.
return torch.FloatTensor(sinusoid_table)
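This is the standard Transformer positional encoding: PE(pos, 2i) = sin(pos / 10000^(2i/d_hid)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d_hid)). A tiny check of my own, assuming the function above is in scope (e.g. run inside transformer/Models.py):
table = get_sinusoid_encoding_table(n_position=6, d_hid=8, padding_idx=0)
print(table.shape)  # torch.Size([6, 8])
print(table[0])     # all zeros: row 0 is reserved for the padding position
print(table[1, 0])  # sin(1 / 10000^0) = sin(1), roughly 0.8415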
# generate the padding mask for each key sequence
def get_attn_key_pad_mask(seq_k, seq_q):
''' For masking out the padding part of key sequence. '''
# Expand to fit the shape of key query attention matrix.
len_q = seq_q.size(1)
padding_mask = seq_k.eq(Constants.PAD)
padding_mask = padding_mask.unsqueeze(
1).expand(-1, len_q, -1) # b x lq x lk
return padding_mask
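To see what the two mask helpers return, here is a toy example of my own (it assumes both functions are in scope and that Constants.PAD == 0):
import torch
seq = torch.tensor([[5, 7, 0, 0],
                    [3, 0, 0, 0]])  # two padded phoneme-id sequences, 0 is PAD
print(get_non_pad_mask(seq).squeeze(-1))
# tensor([[1., 1., 0., 0.],
#         [1., 0., 0., 0.]])
print(get_attn_key_pad_mask(seq_k=seq, seq_q=seq)[0])
# every query row of the first sample masks key positions 2 and 3 (the padded ones):
# tensor([[False, False,  True,  True],
#         [False, False,  True,  True],
#         [False, False,  True,  True],
#         [False, False,  True,  True]])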
# The network before the Length Regulator, i.e. the encoder
class Encoder(nn.Module):
''' Encoder '''
def __init__(self,
n_src_vocab=hp.vocab_size,
len_max_seq=hp.vocab_size,
d_word_vec=hp.encoder_dim,
n_layers=hp.encoder_n_layer,
n_head=hp.encoder_head,
d_k=hp.encoder_dim // hp.encoder_head,
d_v=hp.encoder_dim // hp.encoder_head,
d_model=hp.encoder_dim,
d_inner=hp.encoder_conv1d_filter_size,
dropout=hp.dropout):
super(Encoder, self).__init__()
n_position = len_max_seq + 1
# essentially a word (phoneme) embedding layer
self.src_word_emb = nn.Embedding(n_src_vocab,
d_word_vec,
padding_idx=Constants.PAD)
# build the positional-encoding lookup table
self.position_enc = nn.Embedding.from_pretrained(
get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0),
freeze=True)
# stack multiple FFT blocks
self.layer_stack = nn.ModuleList([FFTBlock(
d_model, d_inner, n_head, d_k, d_v, dropout=dropout) for _ in range(n_layers)])
def forward(self, src_seq, src_pos, return_attns=False):
enc_slf_attn_list = []
# -- Prepare masks
slf_attn_mask = get_attn_key_pad_mask(seq_k=src_seq, seq_q=src_seq)
non_pad_mask = get_non_pad_mask(src_seq)
# -- Forward: turn the phoneme sequence into embeddings and add positional encodings
enc_output = self.src_word_emb(src_seq) + self.position_enc(src_pos)
# pass through the stacked FFT blocks (self-attention)
for enc_layer in self.layer_stack:
enc_output, enc_slf_attn = enc_layer(
enc_output,
non_pad_mask=non_pad_mask,
slf_attn_mask=slf_attn_mask)
if return_attns:
enc_slf_attn_list += [enc_slf_attn]
return enc_output, non_pad_mask
# The network after the Length Regulator, i.e. the decoder
class Decoder(nn.Module):
""" Decoder """
def __init__(self,
len_max_seq=hp.max_seq_len,
n_layers=hp.decoder_n_layer,
n_head=hp.decoder_head,
d_k=hp.decoder_dim // hp.decoder_head,
d_v=hp.decoder_dim // hp.decoder_head,
d_model=hp.decoder_dim,
d_inner=hp.decoder_conv1d_filter_size,
dropout=hp.dropout):
super(Decoder, self).__init__()
n_position = len_max_seq + 1
self.position_enc = nn.Embedding.from_pretrained(
get_sinusoid_encoding_table(n_position, d_model, padding_idx=0),
freeze=True)
self.layer_stack = nn.ModuleList([FFTBlock(
d_model, d_inner, n_head, d_k, d_v, dropout=dropout) for _ in range(n_layers)])
def forward(self, enc_seq, enc_pos, return_attns=False):
dec_slf_attn_list = []
# -- Prepare masks
slf_attn_mask = get_attn_key_pad_mask(seq_k=enc_pos, seq_q=enc_pos)
non_pad_mask = get_non_pad_mask(enc_pos)
# -- Forward
dec_output = enc_seq + self.position_enc(enc_pos) # add positional encodings a second time
for dec_layer in self.layer_stack:
dec_output, dec_slf_attn = dec_layer(
dec_output,
non_pad_mask=non_pad_mask,
slf_attn_mask=slf_attn_mask)
if return_attns:
dec_slf_attn_list += [dec_slf_attn]
return dec_output
Compared with the modules under the transformer directory, the modules in this file (modules.py) sit at a higher level and include the Length Regulator, among others.
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict
from numba import jit
import numpy as np
import copy
import math
import hparams as hp
import utils
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
''' Sinusoid position encoding table '''
def cal_angle(position, hid_idx):
return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)
def get_posi_angle_vec(position):
return [cal_angle(position, hid_j) for hid_j in range(d_hid)]
sinusoid_table = np.array([get_posi_angle_vec(pos_i)
for pos_i in range(n_position)])
sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
if padding_idx is not None:
# zero vector for padding dimension
sinusoid_table[padding_idx] = 0.
return torch.FloatTensor(sinusoid_table)
def clones(module, N):
return nn.ModuleList([copy.deepcopy(module) for _ in range(N)]) # deep copies, so parameters are not shared
# @jit(nopython=True)
def create_alignment(base_mat, duration_predictor_output):
"""
Fill in the alignment matrix based on the alignment information (the duration of each phoneme in the sequence).
@param base_mat: all-zero initial alignment matrix, [batch_size, max_mel_length, max_sequence_len]
@param duration_predictor_output: phoneme duration matrix, [batch_size, max_sequence_len]
@return: the adjusted alignment matrix
"""
N, L = duration_predictor_output.shape # size of the phoneme duration matrix
for i in range(N): # the i-th phoneme sequence in the batch
count = 0
for j in range(L): # the j-th phoneme of the i-th sequence
for k in range(duration_predictor_output[i][j]): # duration_predictor_output[i][j] is the duration (number of frames) of the phoneme at [i, j]
base_mat[i][count+k][j] = 1 # write 1 at duration_predictor_output[i][j] consecutive rows; the number of 1s equals the number of repetitions
count = count + duration_predictor_output[i][j]
return base_mat
class LengthRegulator(nn.Module):
""" Length Regulator """
def __init__(self):
super(LengthRegulator, self).__init__()
self.duration_predictor = DurationPredictor()
# adjust the length of the input phoneme sequence x
def LR(self, x, duration_predictor_output, mel_max_length=None):
"""
Expand the phoneme sequence so that its length matches the mel-spectrogram sequence length, based on the phoneme durations.
@param x: phoneme sequence after the FFT blocks, [batch_size, max_sequence_len, encoder_dim]
@param duration_predictor_output: phoneme duration matrix, [batch_size, max_sequence_len]
@param mel_max_length: maximum length among the mel-spectrogram sequences
@return: the length-adjusted phoneme sequence, [batch_size, expand_max_len, encoder_dim]
"""
# get the maximum length among all mel-spectrogram sequences in the batch
expand_max_len = torch.max(
torch.sum(duration_predictor_output, -1), -1)[0]
# initialize the alignment matrix with zeros, [batch_size, expand_max_len, max_sequence_len]
alignment = torch.zeros(duration_predictor_output.size(0),
expand_max_len,
duration_predictor_output.size(1)).numpy()
# fill in the alignment matrix according to the phoneme durations, [batch_size, expand_max_len, max_sequence_len]
alignment = create_alignment(alignment,
duration_predictor_output.cpu().numpy())
alignment = torch.from_numpy(alignment).to(device)
# multiplying the alignment matrix with the input x expands the phoneme sequence to the mel-spectrogram length
output = alignment @ x # [batch_size, expand_max_len, encoder_dim]
if mel_max_length: # if a maximum mel-spectrogram length is given, additionally pad with zeros up to that length
output = F.pad(
output, (0, 0, 0, mel_max_length-output.size(1), 0, 0))
return output
def forward(self, x, alpha=1.0, target=None, mel_max_length=None): # target is the pre-extracted phoneme duration; whether it is given decides what the module returns
duration_predictor_output = self.duration_predictor(x) # phoneme durations predicted by the duration predictor
if target is not None: # target given: training, where target supervises the duration predictor
output = self.LR(x, target, mel_max_length=mel_max_length)
return output, duration_predictor_output
else: # no target: inference, use the durations predicted by the duration predictor directly
duration_predictor_output = (
(duration_predictor_output + 0.5) * alpha).int()
output = self.LR(x, duration_predictor_output)
mel_pos = torch.stack(
[torch.Tensor([i+1 for i in range(output.size(1))])]).long().to(device)
return output, mel_pos
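Note how alpha controls the speech rate at inference: the float durations are offset by 0.5 and scaled by alpha before truncation to int, so alpha > 1 lengthens every phoneme (slower speech) and alpha < 1 shortens it (faster speech). A tiny illustration of that rounding line with made-up numbers:
import torch
d = torch.tensor([1.2, 3.7, 0.1])  # hypothetical duration-predictor outputs
print(((d + 0.5) * 1.0).int())     # tensor([1, 4, 0], dtype=torch.int32)
print(((d + 0.5) * 1.3).int())     # tensor([2, 5, 0], dtype=torch.int32)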
# Duration predictor: two 1-D convolution layers followed by a fully connected layer
class DurationPredictor(nn.Module):
""" Duration Predictor """
def __init__(self):
super(DurationPredictor, self).__init__()
self.input_size = hp.encoder_dim
self.filter_size = hp.duration_predictor_filter_size
self.kernel = hp.duration_predictor_kernel_size
self.conv_output_size = hp.duration_predictor_filter_size
self.dropout = hp.dropout
self.conv_layer = nn.Sequential(OrderedDict([
("conv1d_1", Conv(self.input_size,
self.filter_size,
kernel_size=self.kernel,
padding=1)),
("layer_norm_1", nn.LayerNorm(self.filter_size)),
("relu_1", nn.ReLU()),
("dropout_1", nn.Dropout(self.dropout)),
("conv1d_2", Conv(self.filter_size,
self.filter_size,
kernel_size=self.kernel,
padding=1)),
("layer_norm_2", nn.LayerNorm(self.filter_size)),
("relu_2", nn.ReLU()),
("dropout_2", nn.Dropout(self.dropout))
]))
self.linear_layer = Linear(self.conv_output_size, 1)
self.relu = nn.ReLU()
def forward(self, encoder_output):
out = self.conv_layer(encoder_output)
out = self.linear_layer(out)
out = self.relu(out)
out = out.squeeze()
if not self.training:
out = out.unsqueeze(0)
return out
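A shape sketch of my own for the duration predictor, assuming the repository root is on PYTHONPATH (encoder_dim, duration_predictor_filter_size and duration_predictor_kernel_size come from the repo's hparams.py; the sequence length is preserved as long as the kernel size there is 3, matching the hard-coded padding of 1):
import torch
import hparams as hp
from modules import DurationPredictor
dp = DurationPredictor()
dp.train()  # in eval mode the module adds an extra unsqueeze(0) to the output
x = torch.randn(4, 12, hp.encoder_dim)  # [batch, max_sequence_len, encoder_dim]
print(dp(x).shape)  # torch.Size([4, 12]): one predicted duration per phoneme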
# 1-D convolution followed by batch normalization
class BatchNormConv1d(nn.Module):
def __init__(self, in_dim, out_dim, kernel_size, stride, padding,
activation=None, w_init_gain='linear'):
super(BatchNormConv1d, self).__init__()
self.conv1d = nn.Conv1d(in_dim, out_dim,
kernel_size=kernel_size,
stride=stride, padding=padding, bias=False)
self.bn = nn.BatchNorm1d(out_dim)
self.activation = activation
torch.nn.init.xavier_uniform_(
self.conv1d.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
def forward(self, x):
x = self.conv1d(x)
if self.activation is not None:
x = self.activation(x)
return self.bn(x)
# Custom 1-D convolution layer that transposes the last two dimensions before and after the convolution
class Conv(nn.Module):
"""
Convolution Module
"""
def __init__(self,
in_channels,
out_channels,
kernel_size=1,
stride=1,
padding=0,
dilation=1,
bias=True,
w_init='linear'):
"""
:param in_channels: dimension of input
:param out_channels: dimension of output
:param kernel_size: size of kernel
:param stride: size of stride
:param padding: size of padding
:param dilation: dilation rate
:param bias: boolean. if True, bias is included.
:param w_init: str. weight inits with xavier initialization.
"""
super(Conv, self).__init__()
self.conv = nn.Conv1d(in_channels,
out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias)
nn.init.xavier_uniform_(
self.conv.weight, gain=nn.init.calculate_gain(w_init))
def forward(self, x):
x = x.contiguous().transpose(1, 2)
x = self.conv(x)
x = x.contiguous().transpose(1, 2)
return x
# Custom linear (fully connected) layer
class Linear(nn.Module):
"""
Linear Module
"""
def __init__(self, in_dim, out_dim, bias=True, w_init='linear'):
"""
:param in_dim: dimension of input
:param out_dim: dimension of output
:param bias: boolean. if True, bias is included.
:param w_init: str. weight inits with xavier initialization.
"""
super(Linear, self).__init__()
self.linear_layer = nn.Linear(in_dim, out_dim, bias=bias)
nn.init.xavier_uniform_(
self.linear_layer.weight,
gain=nn.init.calculate_gain(w_init))
def forward(self, x):
return self.linear_layer(x)
class Highway(nn.Module):
def __init__(self, in_size, out_size):
super(Highway, self).__init__()
self.H = nn.Linear(in_size, out_size)
self.H.bias.data.zero_()
self.T = nn.Linear(in_size, out_size)
self.T.bias.data.fill_(-1)
self.relu = nn.ReLU()
self.sigmoid = nn.Sigmoid()
def forward(self, inputs):
H = self.relu(self.H(inputs))
T = self.sigmoid(self.T(inputs))
return H * T + inputs * (1.0 - T)
# Pre-net
class Prenet(nn.Module):
"""
Prenet before passing through the network
"""
def __init__(self, input_size, hidden_size, output_size):
super(Prenet, self).__init__()
self.input_size = input_size
self.output_size = output_size
self.hidden_size = hidden_size
self.layer = nn.Sequential(OrderedDict([
('fc1', Linear(self.input_size, self.hidden_size)),
('relu1', nn.ReLU()),
('dropout1', nn.Dropout(0.5)),
('fc2', Linear(self.hidden_size, self.output_size)),
('relu2', nn.ReLU()),
('dropout2', nn.Dropout(0.5)),
]))
def forward(self, x):
out = self.layer(x)
return out
# This module converts mel spectrograms into linear magnitude spectrograms, from which Griffin-Lim or another vocoder can reconstruct audio
class CBHG(nn.Module):
"""CBHG module: a recurrent neural network composed of:
- 1-d convolution banks
- Highway networks + residual connections
- Bidirectional gated recurrent units
"""
def __init__(self, in_dim, K=16, projections=[128, 128]):
super(CBHG, self).__init__()
self.in_dim = in_dim
self.relu = nn.ReLU()
self.conv1d_banks = nn.ModuleList(
[BatchNormConv1d(in_dim, in_dim, kernel_size=k, stride=1,
padding=k // 2, activation=self.relu)
for k in range(1, K + 1)])
self.max_pool1d = nn.MaxPool1d(kernel_size=2, stride=1, padding=1)
in_sizes = [K * in_dim] + projections[:-1]
activations = [self.relu] * (len(projections) - 1) + [None]
self.conv1d_projections = nn.ModuleList(
[BatchNormConv1d(in_size, out_size, kernel_size=3, stride=1,
padding=1, activation=ac)
for (in_size, out_size, ac) in zip(
in_sizes, projections, activations)])
self.pre_highway = nn.Linear(projections[-1], in_dim, bias=False)
self.highways = nn.ModuleList(
[Highway(in_dim, in_dim) for _ in range(4)])
self.gru = nn.GRU(
in_dim, in_dim, 1, batch_first=True, bidirectional=True)
def forward(self, inputs, input_lengths=None):
# (B, T_in, in_dim)
x = inputs
# Needed to perform conv1d on time-axis
# (B, in_dim, T_in)
if x.size(-1) == self.in_dim:
x = x.transpose(1, 2)
T = x.size(-1)
# (B, in_dim*K, T_in)
# Concat conv1d bank outputs
x = torch.cat([conv1d(x)[:, :, :T]
for conv1d in self.conv1d_banks], dim=1)
assert x.size(1) == self.in_dim * len(self.conv1d_banks)
x = self.max_pool1d(x)[:, :, :T]
for conv1d in self.conv1d_projections:
x = conv1d(x)
# (B, T_in, in_dim)
# Back to the original shape
x = x.transpose(1, 2)
if x.size(-1) != self.in_dim:
x = self.pre_highway(x)
# Residual connection
x += inputs
for highway in self.highways:
x = highway(x)
if input_lengths is not None:
x = nn.utils.rnn.pack_padded_sequence(
x, input_lengths, batch_first=True)
# (B, T_in, in_dim*2)
self.gru.flatten_parameters()
outputs, _ = self.gru(x)
if input_lengths is not None:
outputs, _ = nn.utils.rnn.pad_packed_sequence(
outputs, batch_first=True)
return outputs
if __name__ == "__main__":
# TEST: walk through the phoneme-sequence length-adjustment process
a = torch.Tensor([[2, 3, 4], [1, 2, 3]]) # phoneme sequence 1
b = torch.Tensor([[5, 6, 7], [7, 8, 9]]) # phoneme sequence 2
c = torch.stack([a, b]) # corresponds to a batch of phoneme sequences
d = torch.Tensor([[1, 4], [6, 3]]).int() # stands in for the duration predictor output, i.e. the phoneme durations
expand_max_len = torch.max(torch.sum(d, -1), -1)[0] # maximum mel-spectrogram length within the batch
base = torch.zeros(c.size(0), expand_max_len, c.size(1)) # initialize the alignment matrix with zeros
# fill in the alignment matrix according to the phoneme durations
alignment = create_alignment(base.numpy(), d.numpy())
print(alignment)
# multiplying the alignment matrix with the batch expands each phoneme sequence to the mel-spectrogram length
# e.g. d[0][0] is 1, so the phoneme at c[0][0] appears once; d[0][1] is 4, so the phoneme at c[0][1] appears four times
print(torch.from_numpy(alignment) @ c)
This file (model.py) assembles all of the modules above into the complete FastSpeech model.
import torch
import torch.nn as nn
import hparams as hp
import utils
from transformer.Models import Encoder, Decoder
from transformer.Layers import Linear, PostNet
from modules import LengthRegulator, CBHG
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class FastSpeech(nn.Module):
""" FastSpeech """
def __init__(self):
super(FastSpeech, self).__init__()
self.encoder = Encoder() # the network before the Length Regulator is the encoder
self.length_regulator = LengthRegulator()
self.decoder = Decoder() # the network after the Length Regulator is the decoder
self.mel_linear = Linear(hp.decoder_dim, hp.num_mels)
self.postnet = CBHG(hp.num_mels, K=8, projections=[256, hp.num_mels]) # CBHG is used as the post-processing network
self.last_linear = Linear(hp.num_mels * 2, hp.num_mels)
def mask_tensor(self, mel_output, position, mel_max_length):
lengths = torch.max(position, -1)[0]
mask = ~utils.get_mask_from_lengths(lengths, max_len=mel_max_length)
mask = mask.unsqueeze(-1).expand(-1, -1, mel_output.size(-1))
return mel_output.masked_fill(mask, 0.)
def forward(self, src_seq, src_pos, mel_pos=None, mel_max_length=None, length_target=None, alpha=1.0):
encoder_output, _ = self.encoder(src_seq, src_pos) # encoder output, [b, max_sequence_len, encoder_dim]
if self.training: # training
# length_regulator_output is the length-adjusted phoneme sequence, matched to the mel-spectrogram length, [b, max_mel_len, encoder_dim]
# duration_predictor_output is the duration predicted by the duration predictor during training, [b, max_sequence_len]
# during training, the pre-extracted duration target is used as supervision for the duration predictor
length_regulator_output, duration_predictor_output = self.length_regulator(encoder_output,
target=length_target,
alpha=alpha,
mel_max_length=mel_max_length)
# decoder output, [b, max_mel_len, decoder_dim]
decoder_output = self.decoder(length_regulator_output, mel_pos)
mel_output = self.mel_linear(decoder_output) # [b, max_mel_len, num_mels]
mel_output = self.mask_tensor(mel_output, mel_pos, mel_max_length) # zero out the padded positions
residual = self.postnet(mel_output) # [b, max_mel_len, num_mels]
residual = self.last_linear(residual) # [b, max_mel_len, num_mels]
mel_postnet_output = mel_output + residual # [b, max_mel_len, num_mels]
mel_postnet_output = self.mask_tensor(mel_postnet_output,
mel_pos,
mel_max_length) # [b, max_mel_len, num_mels]
return mel_output, mel_postnet_output, duration_predictor_output # the last output is used to train the duration predictor
else: # inference
# at inference time, the durations predicted by the trained duration predictor are used directly to expand the phoneme sequence
length_regulator_output, decoder_pos = self.length_regulator(encoder_output,
alpha=alpha)
decoder_output = self.decoder(length_regulator_output, decoder_pos)
mel_output = self.mel_linear(decoder_output)
residual = self.postnet(mel_output)
residual = self.last_linear(residual)
mel_postnet_output = mel_output + residual
return mel_output, mel_postnet_output
if __name__ == "__main__":
# Test
model = FastSpeech()
print(sum(param.numel() for param in model.parameters()))
This note documents the model-construction code of the chosen FastSpeech reimplementation repository; it is best read together with the model section of the earlier FastSpeech paper-reading note. The note mainly consists of detailed code comments; if you find any problems or mistakes, please point them out in the comments so we can learn from each other.