Source: INTERSPEECH 2020
Institution: Ghent University, Belgium
Paper link:
Source code link:
Paper-reading blog: ECAPA_TDNN (Part 1)
MUSAN dataset download: wget https://www.openslr.org/resources/17/musan.tar.gz
RIR dataset download: wget https://openslr.org/resources/28/rirs_noises.zip
audio, sr = soundfile.read(self.data_list[index])
# Pad/crop every utterance to the same fixed length
# (num_frames hops of 160 samples, plus 240 extra samples to cover the 400-sample window)
length = self.num_frames * 160 + 240
if audio.shape[0] <= length:
    shortage = length - audio.shape[0]
    audio = numpy.pad(audio, (0, shortage), 'wrap')
start_frame = numpy.int64(random.random() * (audio.shape[0] - length))
audio = audio[start_frame:start_frame + length]
audio = numpy.stack([audio], axis=0)
# Data augmentation
augtype = random.randint(0, 5)
if augtype == 0:    # Original
    audio = audio
elif augtype == 1:  # Reverberation
    audio = self.add_rev(audio)
elif augtype == 2:  # Babble
    audio = self.add_noise(audio, 'speech')
elif augtype == 3:  # Music
    audio = self.add_noise(audio, 'music')
elif augtype == 4:  # Noise
    audio = self.add_noise(audio, 'noise')
elif augtype == 5:  # Television noise
    audio = self.add_noise(audio, 'speech')
    audio = self.add_noise(audio, 'music')
return torch.FloatTensor(audio[0]), self.data_label[index]
The reverberation augmentation below randomly picks an RIR clip from the dataset, expands its dimensions to match the speech audio, and finally convolves the speech with the RIR.
# Add reverberation
def add_rev(self, audio):
    rir_file = random.choice(self.rir_files)
    rir, sr = soundfile.read(rir_file)
    rir = numpy.expand_dims(rir.astype(numpy.float64), 0)
    # Normalise the RIR energy, then convolve it with the speech
    rir = rir / numpy.sqrt(numpy.sum(rir ** 2))
    return signal.convolve(audio, rir, mode='full')[:, :self.num_frames * 160 + 240]
The noise augmentation below first computes the dB level of the speech, randomly selects n noise clips, adjusts each noise clip to the same length as the speech, computes its dB level, draws a random SNR, derives the noise scaling factor and multiplies the noise by it. All scaled noise clips are then stacked, summed, and added to the speech.
def add_noise(self, audio, noisecat):
    # numpy.mean(audio ** 2) is the signal power
    clean_db = 10 * numpy.log10(numpy.mean(audio ** 2) + 1e-4)
    numnoise = self.numnoise[noisecat]
    noiselist = random.sample(self.noiselist[noisecat], random.randint(numnoise[0], numnoise[1]))
    noises = []
    for noise in noiselist:
        # Assume noiseaudio has already been read and cut/padded to the same length as the speech
        noise_db = 10 * numpy.log10(numpy.mean(noiseaudio ** 2) + 1e-4)
        noisesnr = random.uniform(self.noisesnr[noisecat][0], self.noisesnr[noisecat][1])
        # Scale noiseaudio by the factor derived from the target SNR
        noises.append(numpy.sqrt(10 ** ((clean_db - noise_db - noisesnr) / 10)) * noiseaudio)
    noise = numpy.sum(numpy.concatenate(noises, axis=0), axis=0, keepdims=True)
    return noise + audio
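The loop above relies on a noiseaudio array that the snippet elides. Below is a sketch of the elided step, assuming it mirrors the same read/pad/crop pattern used for the speech audio earlier (the variable names follow the snippet; the exact logic is an assumption, not quoted from the original code):

noiseaudio, sr = soundfile.read(noise)
length = self.num_frames * 160 + 240
if noiseaudio.shape[0] <= length:
    shortage = length - noiseaudio.shape[0]
    # Repeat the noise clip until it is long enough
    noiseaudio = numpy.pad(noiseaudio, (0, shortage), 'wrap')
# Random crop to exactly `length` samples, then add a leading channel dimension
start_frame = numpy.int64(random.random() * (noiseaudio.shape[0] - length))
noiseaudio = noiseaudio[start_frame:start_frame + length]
noiseaudio = numpy.stack([noiseaudio], axis=0)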
The network structure is as follows:
- Data augmentation
- TDNN block
- Multi-layer feature aggregation
- Attentive statistics pooling
- FC + BN
- Output
def forward(self, x, aug):
    # Feature extraction and SpecAugment data augmentation
    with torch.no_grad():
        x = self.torchfbank(x) + 1e-6
        x = x.log()
        x = x - torch.mean(x, dim=-1, keepdim=True)
        if aug == True:
            x = self.specaug(x)
    # Equivalent to one TDNN block
    x = self.conv1(x)
    x = self.relu(x)
    x = self.bn1(x)
    # Multi-layer feature aggregation
    x1 = self.layer1(x)
    x2 = self.layer2(x + x1)
    x3 = self.layer3(x + x1 + x2)
    x = self.layer4(torch.cat((x1, x2, x3), dim=1))
    x = self.relu(x)
    # Attentive statistics pooling
    t = x.size()[-1]
    global_x = torch.cat((x,
                          torch.mean(x, dim=2, keepdim=True).repeat(1, 1, t),
                          torch.sqrt(torch.var(x, dim=2, keepdim=True).clamp(min=1e-4)).repeat(1, 1, t)), dim=1)
    w = self.attention(global_x)
    mu = torch.sum(x * w, dim=2)
    sg = torch.sqrt((torch.sum((x ** 2) * w, dim=2) - mu ** 2).clamp(min=1e-4))
    x = torch.cat((mu, sg), 1)
    x = self.bn5(x)
    x = self.fc6(x)
    x = self.bn6(x)
    return x
SpecAugment is a masking-based data augmentation algorithm; the steps are as follows:
- Pre-emphasis: PreEmphasis(torch.nn.Module) (a sketch of such a module follows after this list)
- Mel-spectrogram extraction:
  torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, f_min=20, f_max=7600, window_fn=torch.hamming_window, n_mels=80)
- Zero-mean normalisation of the mel spectrogram, so that masked positions can simply be set to 0
- Time-axis masking
- Frequency-axis masking
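A minimal sketch of what the PreEmphasis module could look like, assuming the usual first-order filter y[t] = x[t] - 0.97 * x[t-1] implemented as a fixed 1-D convolution (the coefficient 0.97 and the conv1d formulation are assumptions, not quoted from the paper):

import torch
import torch.nn.functional as F

class PreEmphasis(torch.nn.Module):
    def __init__(self, coef: float = 0.97):
        super().__init__()
        # Flipped filter [-coef, 1] so that conv1d computes x[t] - coef * x[t-1]
        self.register_buffer('flipped_filter',
                             torch.FloatTensor([-coef, 1.]).unsqueeze(0).unsqueeze(0))

    def forward(self, x):
        x = x.unsqueeze(1)               # (batch, samples) -> (batch, 1, samples)
        x = F.pad(x, (1, 0), 'reflect')  # pad one sample on the left
        return F.conv1d(x, self.flipped_filter).squeeze(1)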
Full code:
with torch.no_grad():
    # Pre-emphasis and mel-spectrogram extraction
    x = self.torchfbank(x) + 1e-6
    # Log-mel
    x = x.log()
    x = x - torch.mean(x, dim=-1, keepdim=True)
    if aug == True:
        # Apply the masks
        x = self.specaug(x)
Main steps of the masking code:
1. Get the mel-spectrogram dimensions and assign them to batch, fea, time:
   batch is the number of mel spectrograms per batch;
   fea is the feature dimension of each mel spectrogram, here 80;
   time is the time dimension of each mel spectrogram.
2. Mask length: draw a [batch, 1, 1] array.
3. Mask position: draw a [batch, 1, 1] array, constrained by the mask length and the mel dimensions.
4. Create a length-D tensor and expand it to [1, 1, D].
5. Combine mask position and mask length into the mask: [batch, 1, D] -> [batch, D] -> [batch, 1, D] or [batch, D, 1].
6. Set the masked positions of the mel spectrogram to 0.
def mask_along_axis(self, x, dim):
    original_size = x.shape
    batch, fea, time = x.shape
    if dim == 1:
        D = fea
        width_range = self.freq_mask_width
    else:
        D = time
        width_range = self.time_mask_width
    mask_len = torch.randint(width_range[0], width_range[1], (batch, 1), device=x.device).unsqueeze(2)
    mask_pos = torch.randint(0, max(1, D - mask_len.max()), (batch, 1), device=x.device).unsqueeze(2)
    arange = torch.arange(D, device=x.device).view(1, 1, -1)
    mask = (mask_pos <= arange) * (arange < (mask_pos + mask_len))
    mask = mask.any(dim=1)
    if dim == 1:
        mask = mask.unsqueeze(2)
    else:
        mask = mask.unsqueeze(1)
    # Fill the positions of x where mask is True with 0
    x = x.masked_fill_(mask, 0.0)
    return x.view(*original_size)
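For completeness, a sketch of how the wrapper would typically call this function, once along the time axis and once along the frequency axis (an assumption reconstructed from the call x = self.specaug(x) above, not the author's verbatim code):

def forward(self, x):
    x = self.mask_along_axis(x, dim=2)  # time mask
    x = self.mask_along_axis(x, dim=1)  # frequency mask
    return x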
The weighted mean and weighted standard deviation are computed with the following two formulas:

$$\mu_{c} = \sum_{t}^{T} \alpha_{t,c}\, h_{t,c}$$

$$\sigma_{c} = \sqrt{\sum_{t}^{T} \alpha_{t,c}\, h_{t,c}^{2} - \mu_{c}^{2}}$$

The final output of the pooling layer is the concatenation of the weighted mean μ and the weighted standard deviation σ.
# Number of time frames
t = x.size()[-1]
# Mean and standard deviation over the time axis, concatenated with the original features
mean = torch.mean(x, dim=2, keepdim=True).repeat(1, 1, t)
std = torch.sqrt(torch.var(x, dim=2, keepdim=True).clamp(min=1e-4)).repeat(1, 1, t)
global_x = torch.cat((x, mean, std), dim=1)
# Attention weights w from the attention network
w = self.attention(global_x)

# self.attention is defined as:
self.attention = nn.Sequential(
    nn.Conv1d(4608, 256, kernel_size=1),
    nn.ReLU(),
    nn.BatchNorm1d(256),
    nn.Tanh(), # I add this layer
    nn.Conv1d(256, 1536, kernel_size=1),
    nn.Softmax(dim=2),
)

mu = torch.sum(x * w, dim=2)
sg = torch.sqrt((torch.sum((x ** 2) * w, dim=2) - mu ** 2).clamp(min=1e-4))
x = torch.cat((mu, sg), 1)
The 1-D SE block rescales the frame-level features, i.e. it learns the importance of each channel.
The procedure is:
1. Squeeze the features with global average pooling.
2. Pass through two fully connected layers, mainly in order to apply ReLU and sigmoid (mapping the output into the range 0 to 1). The first FC layer reduces the dimension and the second one restores it.
3. The output is the input multiplied by the weight matrix.
def __init__(self, channels, bottleneck=128):
    super(SEModule, self).__init__()
    self.se = nn.Sequential(
        # Global average pooling squeezes each channel to a single value
        nn.AdaptiveAvgPool1d(1),
        nn.Conv1d(channels, bottleneck, kernel_size=1, padding=0),
        nn.ReLU(),
        nn.Conv1d(bottleneck, channels, kernel_size=1, padding=0),
        nn.Sigmoid(),
    )

def forward(self, input):
    # Channel-wise weights
    x = self.se(input)
    return input * x
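A quick usage sketch (with hypothetical tensor sizes) showing that the SE weights are applied per channel, so the output keeps the input shape:

se = SEModule(channels=512)   # SEModule as defined above
x = torch.randn(4, 512, 200)  # (batch, channels, frames)
print(se(x).shape)            # torch.Size([4, 512, 200])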
Res2Net exploits fine-grained multi-scale information and produces combinations of multiple receptive fields. The left figure below shows the multi-scale design of Res2Net; the right figure shows the Res2Net module used in this paper.
As the left figure shows, Res2Net decouples the 3x3 convolution of a traditional ResNet into multiple scales: after the 1x1 convolution the channels are split into groups, and a larger scale means a higher computational cost.
As the right figure shows, the module contains dilated convolutions and dense layers before and after: the first dense layer reduces the dimension, the second dense layer restores it, and finally the SE module rescales each channel.
The Res2Net used in this paper has a scale of 8; in the code the last split (labelled x1 in the figure) is concatenated to the output directly, without passing through a convolution (see the sketch after the forward code).
def forward(self, x):
    residual = x
    #############################
    out = self.conv1(x)
    out = self.relu(out)
    out = self.bn1(out)
    #############################
    #############################
    # The core of Res2Net
    spx = torch.split(out, self.width, 1)
    # Convolve the splits one by one
    for i in range(self.nums):
        if i == 0:
            sp = spx[i]
        else:
            sp = sp + spx[i]
        sp = self.convs[i](sp)
        sp = self.relu(sp)
        sp = self.bns[i](sp)
        if i == 0:
            out = sp
        else:
            out = torch.cat((out, sp), 1)
    # Concatenate the last split (x1), which is passed through unchanged
    out = torch.cat((out, spx[self.nums]), 1)
    ###############################
    ###############################
    out = self.conv3(out)
    out = self.relu(out)
    out = self.bn3(out)
    ###############################
    out = self.se(out)
    out += residual
    return out
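The forward pass above uses self.width, self.nums, self.convs, self.bns and so on without showing where they come from. Below is a sketch of an __init__ that would supply them, reconstructed from the forward code under the scale-8 assumption (layer sizes and argument names are assumptions, not the author's exact code):

import math
import torch.nn as nn

class Bottle2neck(nn.Module):
    def __init__(self, inplanes, planes, kernel_size=3, dilation=1, scale=8):
        super(Bottle2neck, self).__init__()
        width = int(math.floor(planes / scale))
        self.conv1 = nn.Conv1d(inplanes, width * scale, kernel_size=1)  # first dense layer
        self.bn1 = nn.BatchNorm1d(width * scale)
        self.nums = scale - 1  # the last split is passed through unchanged
        pad = math.floor(kernel_size / 2) * dilation
        self.convs = nn.ModuleList([nn.Conv1d(width, width, kernel_size=kernel_size,
                                              dilation=dilation, padding=pad)
                                    for _ in range(self.nums)])
        self.bns = nn.ModuleList([nn.BatchNorm1d(width) for _ in range(self.nums)])
        self.conv3 = nn.Conv1d(width * scale, planes, kernel_size=1)    # second dense layer
        self.bn3 = nn.BatchNorm1d(planes)
        self.relu = nn.ReLU()
        self.se = SEModule(planes)  # SEModule as defined earlier
        self.width = width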
MFA (multi-layer feature aggregation) concatenates the output feature maps of all SE-Res2Blocks.
Overall code:
x1 = self.layer1(x)
x2 = self.layer2(x+x1)
x3 = self.layer3(x+x1+x2)
x = self.layer4(torch.cat((x1,x2,x3),dim=1))
x = self.relu(x)
For a detailed introduction, see: https://blog.csdn.net/qq_39478403/article/details/116788113
Additive angular margin loss (AAM-softmax)
It was first used for face recognition.
Principle: maximise the inter-class margin and minimise the intra-class distance.
The softmax loss produces noticeably ambiguous decision boundaries, whereas AAM-softmax widens the gap between classes by adding an additive angular margin.
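Written out, the AAM-softmax objective for sample $i$ with label $y_i$ adds the margin $m$ inside the cosine of the target class and scales everything by $s$:

$$L = -\frac{1}{N}\sum_{i=1}^{N}\log\frac{e^{s\cos(\theta_{y_i}+m)}}{e^{s\cos(\theta_{y_i}+m)}+\sum_{j\neq y_i}e^{s\cos\theta_{j}}}$$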
cosine = F.linear(F.normalize(x), F.normalize(self.weight))
sine = torch.sqrt((1.0 - torch.mul(cosine, cosine)).clamp(0, 1))
phi = cosine * self.cos_m - sine * self.sin_m     # cos(theta + m)
phi = torch.where((cosine - self.th) > 0, phi, cosine - self.mm)
one_hot = torch.zeros_like(cosine)                # all-zero matrix
one_hot.scatter_(1, label.view(-1, 1), 1)         # put a 1 at each label index
output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
output = output * self.s
loss = self.ce(output, label)

self.ce = nn.CrossEntropyLoss()
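The snippet relies on self.weight, self.cos_m, self.sin_m, self.th, self.mm and self.s being prepared in __init__. Below is a sketch of such an initialisation, where m is the angular margin and s the scale (the default values and the embedding size are assumptions, not taken from the paper):

import math
import torch
import torch.nn as nn

class AAMsoftmax(nn.Module):
    def __init__(self, n_class, m=0.2, s=30, emb_dim=192):
        super(AAMsoftmax, self).__init__()
        self.m, self.s = m, s
        self.weight = nn.Parameter(torch.FloatTensor(n_class, emb_dim))
        nn.init.xavier_normal_(self.weight, gain=1)
        self.ce = nn.CrossEntropyLoss()
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.th = math.cos(math.pi - m)        # threshold for falling back to cosine - mm
        self.mm = math.sin(math.pi - m) * m    # penalty used when cos(theta) <= th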