mirror of https://github.com/malarinv/tacotron2
parent d0d273a698
commit 36c731cad0
glow.py (216)
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 # *****************************************************************************
 # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 #
@@ -13,18 +12,19 @@
 #       names of its contributors may be used to endorse or promote products
 #       derived from this software without specific prior written permission.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
-# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 # *****************************************************************************
 import copy
 import torch
 from torch.autograd import Variable
 import torch.nn.functional as F
@@ -33,9 +33,9 @@ import torch.nn.functional as F
 @torch.jit.script
 def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
     n_channels_int = n_channels[0]
-    in_act = input_a + input_b
-    t_act = torch.nn.functional.tanh(in_act[:, :n_channels_int, :])
-    s_act = torch.nn.functional.sigmoid(in_act[:, n_channels_int:, :])
+    in_act = input_a+input_b
+    t_act = torch.tanh(in_act[:, :n_channels_int, :])
+    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
     acts = t_act * s_act
     return acts
@@ -55,12 +55,8 @@ class WaveGlowLoss(torch.nn.Module):
                 log_s_total = log_s_total + torch.sum(log_s)
                 log_det_W_total += log_det_W_list[i]

-        loss = (
-            torch.sum(z * z) / (2 * self.sigma * self.sigma)
-            - log_s_total
-            - log_det_W_total
-        )
-        return loss / (z.size(0) * z.size(1) * z.size(2))
+        loss = torch.sum(z*z)/(2*self.sigma*self.sigma) - log_s_total - log_det_W_total
+        return loss/(z.size(0)*z.size(1)*z.size(2))


 class Invertible1x1Conv(torch.nn.Module):
@@ -69,19 +65,17 @@ class Invertible1x1Conv(torch.nn.Module):
     of its weight matrix. If reverse=True it does convolution with
     inverse
     """

     def __init__(self, c):
         super(Invertible1x1Conv, self).__init__()
-        self.conv = torch.nn.Conv1d(
-            c, c, kernel_size=1, stride=1, padding=0, bias=False
-        )
+        self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0,
+                                    bias=False)

         # Sample a random orthonormal matrix to initialize weights
         W = torch.qr(torch.FloatTensor(c, c).normal_())[0]

         # Ensure determinant is 1.0 not -1.0
         if torch.det(W) < 0:
-            W[:, 0] = -1 * W[:, 0]
+            W[:,0] = -1*W[:,0]
         W = W.view(c, c, 1)
         self.conv.weight.data = W
@@ -92,11 +86,11 @@ class Invertible1x1Conv(torch.nn.Module):
         W = self.conv.weight.squeeze()

         if reverse:
-            if not hasattr(self, "W_inverse"):
+            if not hasattr(self, 'W_inverse'):
                 # Reverse computation
-                W_inverse = W.inverse()
+                W_inverse = W.float().inverse()
                 W_inverse = Variable(W_inverse[..., None])
-                if z.type() == "torch.cuda.HalfTensor":
+                if z.type() == 'torch.HalfTensor':
                     W_inverse = W_inverse.half()
                 self.W_inverse = W_inverse
             z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0)
@@ -110,102 +104,86 @@ class Invertible1x1Conv(torch.nn.Module):

 class WN(torch.nn.Module):
     """
-    This is the WaveNet like layer for the affine coupling. The primary
-    difference from WaveNet is the convolutions need not be causal. There is
-    also no dilation size reset. The dilation only doubles on each layer
+    This is the WaveNet like layer for the affine coupling. The primary difference
+    from WaveNet is the convolutions need not be causal. There is also no dilation
+    size reset. The dilation only doubles on each layer
     """

-    def __init__(
-        self, n_in_channels, n_mel_channels, n_layers, n_channels, kernel_size
-    ):
+    def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels,
+                 kernel_size):
         super(WN, self).__init__()
-        assert kernel_size % 2 == 1
-        assert n_channels % 2 == 0
+        assert(kernel_size % 2 == 1)
+        assert(n_channels % 2 == 0)
         self.n_layers = n_layers
         self.n_channels = n_channels
         self.in_layers = torch.nn.ModuleList()
         self.res_skip_layers = torch.nn.ModuleList()
-        self.cond_layers = torch.nn.ModuleList()

         start = torch.nn.Conv1d(n_in_channels, n_channels, 1)
-        start = torch.nn.utils.weight_norm(start, name="weight")
+        start = torch.nn.utils.weight_norm(start, name='weight')
         self.start = start

         # Initializing last layer to 0 makes the affine coupling layers
         # do nothing at first. This helps with training stability
-        end = torch.nn.Conv1d(n_channels, 2 * n_in_channels, 1)
+        end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1)
         end.weight.data.zero_()
         end.bias.data.zero_()
         self.end = end

+        cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels*n_layers, 1)
+        self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
+
         for i in range(n_layers):
             dilation = 2 ** i
-            padding = int((kernel_size * dilation - dilation) / 2)
-            in_layer = torch.nn.Conv1d(
-                n_channels,
-                2 * n_channels,
-                kernel_size,
-                dilation=dilation,
-                padding=padding,
-            )
-            in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
+            padding = int((kernel_size*dilation - dilation)/2)
+            in_layer = torch.nn.Conv1d(n_channels, 2*n_channels, kernel_size,
+                                       dilation=dilation, padding=padding)
+            in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
             self.in_layers.append(in_layer)

-            cond_layer = torch.nn.Conv1d(n_mel_channels, 2 * n_channels, 1)
-            cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
-            self.cond_layers.append(cond_layer)
-
             # last one is not necessary
             if i < n_layers - 1:
-                res_skip_channels = 2 * n_channels
+                res_skip_channels = 2*n_channels
             else:
                 res_skip_channels = n_channels
             res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1)
-            res_skip_layer = torch.nn.utils.weight_norm(
-                res_skip_layer, name="weight"
-            )
+            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
             self.res_skip_layers.append(res_skip_layer)

     def forward(self, forward_input):
         audio, spect = forward_input
         audio = self.start(audio)
+        output = torch.zeros_like(audio)
+        n_channels_tensor = torch.IntTensor([self.n_channels])
+
+        spect = self.cond_layer(spect)
+
         for i in range(self.n_layers):
+            spect_offset = i*2*self.n_channels
             acts = fused_add_tanh_sigmoid_multiply(
                 self.in_layers[i](audio),
-                self.cond_layers[i](spect),
-                torch.IntTensor([self.n_channels]),
-            )
+                spect[:,spect_offset:spect_offset+2*self.n_channels,:],
+                n_channels_tensor)

             res_skip_acts = self.res_skip_layers[i](acts)
             if i < self.n_layers - 1:
-                audio = res_skip_acts[:, : self.n_channels, :] + audio
-                skip_acts = res_skip_acts[:, self.n_channels :, :]
+                audio = audio + res_skip_acts[:,:self.n_channels,:]
+                output = output + res_skip_acts[:,self.n_channels:,:]
             else:
-                skip_acts = res_skip_acts
-
-            if i == 0:
-                output = skip_acts
-            else:
-                output = skip_acts + output
+                output = output + res_skip_acts
         return self.end(output)


 class WaveGlow(torch.nn.Module):
-    def __init__(
-        self,
-        n_mel_channels,
-        n_flows,
-        n_group,
-        n_early_every,
-        n_early_size,
-        WN_config,
-    ):
+    def __init__(self, n_mel_channels, n_flows, n_group, n_early_every,
+                 n_early_size, WN_config):
         super(WaveGlow, self).__init__()

-        self.upsample = torch.nn.ConvTranspose1d(
-            n_mel_channels, n_mel_channels, 1024, stride=256
-        )
-        assert n_group % 2 == 0
+        self.upsample = torch.nn.ConvTranspose1d(n_mel_channels,
+                                                 n_mel_channels,
+                                                 1024, stride=256)
+        assert(n_group % 2 == 0)
         self.n_flows = n_flows
         self.n_group = n_group
         self.n_early_every = n_early_every
@@ -213,19 +191,18 @@ class WaveGlow(torch.nn.Module):
         self.WN = torch.nn.ModuleList()
         self.convinv = torch.nn.ModuleList()

-        n_half = int(n_group / 2)
+        n_half = int(n_group/2)

         # Set up layers with the right sizes based on how many dimensions
         # have been output already
         n_remaining_channels = n_group
         for k in range(n_flows):
             if k % self.n_early_every == 0 and k > 0:
-                n_half = n_half - int(self.n_early_size / 2)
+                n_half = n_half - int(self.n_early_size/2)
                 n_remaining_channels = n_remaining_channels - self.n_early_size
             self.convinv.append(Invertible1x1Conv(n_remaining_channels))
-            self.WN.append(WN(n_half, n_mel_channels * n_group, **WN_config))
-        self.n_remaining_channels = n_remaining_channels
-        # Useful during inference
+            self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config))
+        self.n_remaining_channels = n_remaining_channels # Useful during inference

     def forward(self, forward_input):
         """
@@ -236,16 +213,12 @@ class WaveGlow(torch.nn.Module):

         # Upsample spectrogram to size of audio
         spect = self.upsample(spect)
-        assert spect.size(2) >= audio.size(1)
+        assert(spect.size(2) >= audio.size(1))
         if spect.size(2) > audio.size(1):
-            spect = spect[:, :, : audio.size(1)]
+            spect = spect[:, :, :audio.size(1)]

         spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
-        spect = (
-            spect.contiguous()
-            .view(spect.size(0), spect.size(1), -1)
-            .permute(0, 2, 1)
-        )
+        spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)

         audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1)
         output_audio = []
@@ -254,26 +227,26 @@ class WaveGlow(torch.nn.Module):

         for k in range(self.n_flows):
             if k % self.n_early_every == 0 and k > 0:
-                output_audio.append(audio[:, : self.n_early_size, :])
-                audio = audio[:, self.n_early_size :, :]
+                output_audio.append(audio[:,:self.n_early_size,:])
+                audio = audio[:,self.n_early_size:,:]

             audio, log_det_W = self.convinv[k](audio)
             log_det_W_list.append(log_det_W)

-            n_half = int(audio.size(1) / 2)
-            audio_0 = audio[:, :n_half, :]
-            audio_1 = audio[:, n_half:, :]
+            n_half = int(audio.size(1)/2)
+            audio_0 = audio[:,:n_half,:]
+            audio_1 = audio[:,n_half:,:]

             output = self.WN[k]((audio_0, spect))
             log_s = output[:, n_half:, :]
             b = output[:, :n_half, :]
-            audio_1 = torch.exp(log_s) * audio_1 + b
+            audio_1 = torch.exp(log_s)*audio_1 + b
             log_s_list.append(log_s)

-            audio = torch.cat([audio_0, audio_1], 1)
+            audio = torch.cat([audio_0, audio_1],1)

         output_audio.append(audio)
-        return torch.cat(output_audio, 1), log_s_list, log_det_W_list
+        return torch.cat(output_audio,1), log_s_list, log_det_W_list

     def infer(self, spect, sigma=1.0):
         spect = self.upsample(spect)
@@ -282,52 +255,41 @@ class WaveGlow(torch.nn.Module):
         spect = spect[:, :, :-time_cutoff]

         spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
-        spect = (
-            spect.contiguous()
-            .view(spect.size(0), spect.size(1), -1)
-            .permute(0, 2, 1)
-        )
+        spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)

-        if spect.type() == "torch.cuda.HalfTensor":
-            audio = torch.cuda.HalfTensor(
-                spect.size(0), self.n_remaining_channels, spect.size(2)
-            ).normal_()
+        if spect.type() == 'torch.HalfTensor':
+            audio = torch.HalfTensor(spect.size(0),
+                                     self.n_remaining_channels,
+                                     spect.size(2)).normal_()
         else:
             # cuda.FloatTensor -> FloatTensor
-            audio = torch.FloatTensor(
-                spect.size(0), self.n_remaining_channels, spect.size(2)
-            ).normal_()
+            audio = torch.FloatTensor(spect.size(0),
+                                      self.n_remaining_channels,
+                                      spect.size(2)).normal_()

-        audio = torch.autograd.Variable(sigma * audio)
+        audio = torch.autograd.Variable(sigma*audio)

         for k in reversed(range(self.n_flows)):
-            n_half = int(audio.size(1) / 2)
-            audio_0 = audio[:, :n_half, :]
-            audio_1 = audio[:, n_half:, :]
+            n_half = int(audio.size(1)/2)
+            audio_0 = audio[:,:n_half,:]
+            audio_1 = audio[:,n_half:,:]

             output = self.WN[k]((audio_0, spect))

             s = output[:, n_half:, :]
             b = output[:, :n_half, :]
-            audio_1 = (audio_1 - b) / torch.exp(s)
-            audio = torch.cat([audio_0, audio_1], 1)
+            audio_1 = (audio_1 - b)/torch.exp(s)
+            audio = torch.cat([audio_0, audio_1],1)

             audio = self.convinv[k](audio, reverse=True)

             if k % self.n_early_every == 0 and k > 0:
-                if spect.type() == "torch.cuda.HalfTensor":
-                    z = torch.cuda.HalfTensor(
-                        spect.size(0), self.n_early_size, spect.size(2)
-                    ).normal_()
+                if spect.type() == 'torch.HalfTensor':
+                    z = torch.HalfTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_()
                 else:
                     # cuda.FloatTensor -> FloatTensor
-                    z = torch.FloatTensor(
-                        spect.size(0), self.n_early_size, spect.size(2)
-                    ).normal_()
-                audio = torch.cat((sigma * z, audio), 1)
+                    z = torch.FloatTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_()
+                audio = torch.cat((sigma*z, audio),1)

-        audio = (
-            audio.permute(0, 2, 1).contiguous().view(audio.size(0), -1).data
-        )
+        audio = audio.permute(0,2,1).contiguous().view(audio.size(0), -1).data
         return audio

     @staticmethod
@@ -336,7 +298,7 @@ class WaveGlow(torch.nn.Module):
         for WN in waveglow.WN:
             WN.start = torch.nn.utils.remove_weight_norm(WN.start)
             WN.in_layers = remove(WN.in_layers)
-            WN.cond_layers = remove(WN.cond_layers)
+            WN.cond_layer = torch.nn.utils.remove_weight_norm(WN.cond_layer)
             WN.res_skip_layers = remove(WN.res_skip_layers)
         return waveglow
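The main functional change in the hunks above is that WN's per-layer cond_layers are replaced by one fused cond_layer whose 2*n_channels*n_layers output channels are sliced per layer via spect_offset. The sketch below is not part of the commit; it only illustrates that channel-offset arithmetic with made-up sizes (n_mel, n_channels, n_layers, T are hypothetical) and ignores the weight_norm and fresh initialization the real layer gets.

import torch

n_mel, n_channels, n_layers, T = 80, 4, 3, 17   # hypothetical sizes
spect = torch.randn(1, n_mel, T)

per_layer = [torch.nn.Conv1d(n_mel, 2 * n_channels, 1) for _ in range(n_layers)]  # old cond_layers
fused = torch.nn.Conv1d(n_mel, 2 * n_channels * n_layers, 1)                      # new cond_layer

# Stack the per-layer weights and biases along the fused conv's output dimension
fused.weight.data = torch.cat([c.weight.data for c in per_layer], dim=0)
fused.bias.data = torch.cat([c.bias.data for c in per_layer], dim=0)

out = fused(spect)
for i, c in enumerate(per_layer):
    spect_offset = i * 2 * n_channels
    # Each per-layer slice of the fused output matches the old per-layer conv
    assert torch.allclose(out[:, spect_offset:spect_offset + 2 * n_channels, :],
                          c(spect), atol=1e-6)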
@@ -0,0 +1,349 @@
+# -*- coding: utf-8 -*-
+# *****************************************************************************
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of the NVIDIA CORPORATION nor the
+#       names of its contributors may be used to endorse or promote products
+#       derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+import torch
+from torch.autograd import Variable
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+    n_channels_int = n_channels[0]
+    in_act = input_a + input_b
+    t_act = torch.nn.functional.tanh(in_act[:, :n_channels_int, :])
+    s_act = torch.nn.functional.sigmoid(in_act[:, n_channels_int:, :])
+    acts = t_act * s_act
+    return acts
+
+
+class WaveGlowLoss(torch.nn.Module):
+    def __init__(self, sigma=1.0):
+        super(WaveGlowLoss, self).__init__()
+        self.sigma = sigma
+
+    def forward(self, model_output):
+        z, log_s_list, log_det_W_list = model_output
+        for i, log_s in enumerate(log_s_list):
+            if i == 0:
+                log_s_total = torch.sum(log_s)
+                log_det_W_total = log_det_W_list[i]
+            else:
+                log_s_total = log_s_total + torch.sum(log_s)
+                log_det_W_total += log_det_W_list[i]
+
+        loss = (
+            torch.sum(z * z) / (2 * self.sigma * self.sigma)
+            - log_s_total
+            - log_det_W_total
+        )
+        return loss / (z.size(0) * z.size(1) * z.size(2))
+
+
+class Invertible1x1Conv(torch.nn.Module):
+    """
+    The layer outputs both the convolution, and the log determinant
+    of its weight matrix. If reverse=True it does convolution with
+    inverse
+    """
+
+    def __init__(self, c):
+        super(Invertible1x1Conv, self).__init__()
+        self.conv = torch.nn.Conv1d(
+            c, c, kernel_size=1, stride=1, padding=0, bias=False
+        )
+
+        # Sample a random orthonormal matrix to initialize weights
+        W = torch.qr(torch.FloatTensor(c, c).normal_())[0]
+
+        # Ensure determinant is 1.0 not -1.0
+        if torch.det(W) < 0:
+            W[:, 0] = -1 * W[:, 0]
+        W = W.view(c, c, 1)
+        self.conv.weight.data = W
+
+    def forward(self, z, reverse=False):
+        # shape
+        batch_size, group_size, n_of_groups = z.size()
+
+        W = self.conv.weight.squeeze()
+
+        if reverse:
+            if not hasattr(self, "W_inverse"):
+                # Reverse computation
+                W_inverse = W.inverse()
+                W_inverse = Variable(W_inverse[..., None])
+                if z.type() == "torch.cuda.HalfTensor":
+                    W_inverse = W_inverse.half()
+                self.W_inverse = W_inverse
+            z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0)
+            return z
+        else:
+            # Forward computation
+            log_det_W = batch_size * n_of_groups * torch.logdet(W)
+            z = self.conv(z)
+            return z, log_det_W
+
+
+class WN(torch.nn.Module):
+    """
+    This is the WaveNet like layer for the affine coupling. The primary
+    difference from WaveNet is the convolutions need not be causal. There is
+    also no dilation size reset. The dilation only doubles on each layer
+    """
+
+    def __init__(
+        self, n_in_channels, n_mel_channels, n_layers, n_channels, kernel_size
+    ):
+        super(WN, self).__init__()
+        assert kernel_size % 2 == 1
+        assert n_channels % 2 == 0
+        self.n_layers = n_layers
+        self.n_channels = n_channels
+        self.in_layers = torch.nn.ModuleList()
+        self.res_skip_layers = torch.nn.ModuleList()
+        self.cond_layers = torch.nn.ModuleList()
+
+        start = torch.nn.Conv1d(n_in_channels, n_channels, 1)
+        start = torch.nn.utils.weight_norm(start, name="weight")
+        self.start = start
+
+        # Initializing last layer to 0 makes the affine coupling layers
+        # do nothing at first. This helps with training stability
+        end = torch.nn.Conv1d(n_channels, 2 * n_in_channels, 1)
+        end.weight.data.zero_()
+        end.bias.data.zero_()
+        self.end = end
+
+        for i in range(n_layers):
+            dilation = 2 ** i
+            padding = int((kernel_size * dilation - dilation) / 2)
+            in_layer = torch.nn.Conv1d(
+                n_channels,
+                2 * n_channels,
+                kernel_size,
+                dilation=dilation,
+                padding=padding,
+            )
+            in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
+            self.in_layers.append(in_layer)
+
+            cond_layer = torch.nn.Conv1d(n_mel_channels, 2 * n_channels, 1)
+            cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
+            self.cond_layers.append(cond_layer)
+
+            # last one is not necessary
+            if i < n_layers - 1:
+                res_skip_channels = 2 * n_channels
+            else:
+                res_skip_channels = n_channels
+            res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1)
+            res_skip_layer = torch.nn.utils.weight_norm(
+                res_skip_layer, name="weight"
+            )
+            self.res_skip_layers.append(res_skip_layer)
+
+    def forward(self, forward_input):
+        audio, spect = forward_input
+        audio = self.start(audio)
+        for i in range(self.n_layers):
+            acts = fused_add_tanh_sigmoid_multiply(
+                self.in_layers[i](audio),
+                self.cond_layers[i](spect),
+                torch.IntTensor([self.n_channels]),
+            )
+
+            res_skip_acts = self.res_skip_layers[i](acts)
+            if i < self.n_layers - 1:
+                audio = res_skip_acts[:, : self.n_channels, :] + audio
+                skip_acts = res_skip_acts[:, self.n_channels :, :]
+            else:
+                skip_acts = res_skip_acts
+
+            if i == 0:
+                output = skip_acts
+            else:
+                output = skip_acts + output
+        return self.end(output)
+
+
+class WaveGlow(torch.nn.Module):
+    def __init__(
+        self,
+        n_mel_channels,
+        n_flows,
+        n_group,
+        n_early_every,
+        n_early_size,
+        WN_config,
+    ):
+        super(WaveGlow, self).__init__()
+
+        self.upsample = torch.nn.ConvTranspose1d(
+            n_mel_channels, n_mel_channels, 1024, stride=256
+        )
+        assert n_group % 2 == 0
+        self.n_flows = n_flows
+        self.n_group = n_group
+        self.n_early_every = n_early_every
+        self.n_early_size = n_early_size
+        self.WN = torch.nn.ModuleList()
+        self.convinv = torch.nn.ModuleList()
+
+        n_half = int(n_group / 2)
+
+        # Set up layers with the right sizes based on how many dimensions
+        # have been output already
+        n_remaining_channels = n_group
+        for k in range(n_flows):
+            if k % self.n_early_every == 0 and k > 0:
+                n_half = n_half - int(self.n_early_size / 2)
+                n_remaining_channels = n_remaining_channels - self.n_early_size
+            self.convinv.append(Invertible1x1Conv(n_remaining_channels))
+            self.WN.append(WN(n_half, n_mel_channels * n_group, **WN_config))
+        self.n_remaining_channels = n_remaining_channels
+        # Useful during inference
+
+    def forward(self, forward_input):
+        """
+        forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames
+        forward_input[1] = audio: batch x time
+        """
+        spect, audio = forward_input
+
+        # Upsample spectrogram to size of audio
+        spect = self.upsample(spect)
+        assert spect.size(2) >= audio.size(1)
+        if spect.size(2) > audio.size(1):
+            spect = spect[:, :, : audio.size(1)]
+
+        spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
+        spect = (
+            spect.contiguous()
+            .view(spect.size(0), spect.size(1), -1)
+            .permute(0, 2, 1)
+        )
+
+        audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1)
+        output_audio = []
+        log_s_list = []
+        log_det_W_list = []
+
+        for k in range(self.n_flows):
+            if k % self.n_early_every == 0 and k > 0:
+                output_audio.append(audio[:, : self.n_early_size, :])
+                audio = audio[:, self.n_early_size :, :]
+
+            audio, log_det_W = self.convinv[k](audio)
+            log_det_W_list.append(log_det_W)
+
+            n_half = int(audio.size(1) / 2)
+            audio_0 = audio[:, :n_half, :]
+            audio_1 = audio[:, n_half:, :]
+
+            output = self.WN[k]((audio_0, spect))
+            log_s = output[:, n_half:, :]
+            b = output[:, :n_half, :]
+            audio_1 = torch.exp(log_s) * audio_1 + b
+            log_s_list.append(log_s)
+
+            audio = torch.cat([audio_0, audio_1], 1)
+
+        output_audio.append(audio)
+        return torch.cat(output_audio, 1), log_s_list, log_det_W_list
+
+    def infer(self, spect, sigma=1.0):
+        spect = self.upsample(spect)
+        # trim conv artifacts. maybe pad spec to kernel multiple
+        time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0]
+        spect = spect[:, :, :-time_cutoff]
+
+        spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
+        spect = (
+            spect.contiguous()
+            .view(spect.size(0), spect.size(1), -1)
+            .permute(0, 2, 1)
+        )
+
+        if spect.type() == "torch.cuda.HalfTensor":
+            audio = torch.cuda.HalfTensor(
+                spect.size(0), self.n_remaining_channels, spect.size(2)
+            ).normal_()
+        else:
+            # cuda.FloatTensor -> FloatTensor
+            audio = torch.FloatTensor(
+                spect.size(0), self.n_remaining_channels, spect.size(2)
+            ).normal_()
+
+        audio = torch.autograd.Variable(sigma * audio)
+
+        for k in reversed(range(self.n_flows)):
+            n_half = int(audio.size(1) / 2)
+            audio_0 = audio[:, :n_half, :]
+            audio_1 = audio[:, n_half:, :]
+
+            output = self.WN[k]((audio_0, spect))
+            s = output[:, n_half:, :]
+            b = output[:, :n_half, :]
+            audio_1 = (audio_1 - b) / torch.exp(s)
+            audio = torch.cat([audio_0, audio_1], 1)
+
+            audio = self.convinv[k](audio, reverse=True)
+
+            if k % self.n_early_every == 0 and k > 0:
+                if spect.type() == "torch.cuda.HalfTensor":
+                    z = torch.cuda.HalfTensor(
+                        spect.size(0), self.n_early_size, spect.size(2)
+                    ).normal_()
+                else:
+                    # cuda.FloatTensor -> FloatTensor
+                    z = torch.FloatTensor(
+                        spect.size(0), self.n_early_size, spect.size(2)
+                    ).normal_()
+                audio = torch.cat((sigma * z, audio), 1)
+
+        audio = (
+            audio.permute(0, 2, 1).contiguous().view(audio.size(0), -1).data
+        )
+        return audio
+
+    @staticmethod
+    def remove_weightnorm(model):
+        waveglow = model
+        for WN in waveglow.WN:
+            WN.start = torch.nn.utils.remove_weight_norm(WN.start)
+            WN.in_layers = remove(WN.in_layers)
+            WN.cond_layers = remove(WN.cond_layers)
+            WN.res_skip_layers = remove(WN.res_skip_layers)
+        return waveglow
+
+
+def remove(conv_list):
+    new_conv_list = torch.nn.ModuleList()
+    for old_conv in conv_list:
+        old_conv = torch.nn.utils.remove_weight_norm(old_conv)
+        new_conv_list.append(old_conv)
+    return new_conv_list
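For reference, a minimal usage sketch of the classes defined in the file above. The hyperparameters, shapes and sigma value are made up for illustration; real values come from the training configuration, not from this file.

import torch

WN_config = dict(n_layers=4, n_channels=64, kernel_size=3)   # hypothetical, small
model = WaveGlow(n_mel_channels=80, n_flows=4, n_group=8,
                 n_early_every=4, n_early_size=2, WN_config=WN_config)
criterion = WaveGlowLoss(sigma=1.0)

spect = torch.randn(2, 80, 63)      # batch x n_mel_channels x frames
audio = torch.randn(2, 63 * 256)    # batch x time (256 = upsample stride)

# Training-style pass: flows map audio to latent z plus the log terms the loss needs
z, log_s_list, log_det_W_list = model((spect, audio))
loss = criterion((z, log_s_list, log_det_W_list))

# Inference samples z ~ N(0, sigma^2) and runs the flows in reverse
with torch.no_grad():
    generated = model.infer(spect, sigma=0.6)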
@@ -7,19 +7,19 @@ class Denoiser(torch.nn.Module):
     """ Removes model bias from audio produced with waveglow """

     def __init__(self, waveglow, filter_length=1024, n_overlap=4,
-                 win_length=1024, mode='zeros'):
+                 win_length=1024, mode='zeros', n_mel_channels=80,):
         super(Denoiser, self).__init__()
         self.stft = STFT(filter_length=filter_length,
                          hop_length=int(filter_length/n_overlap),
                          win_length=win_length).cpu()
         if mode == 'zeros':
             mel_input = torch.zeros(
-                (1, 80, 88),
+                (1, n_mel_channels, 88),
                 dtype=waveglow.upsample.weight.dtype,
                 device=waveglow.upsample.weight.device)
         elif mode == 'normal':
             mel_input = torch.randn(
-                (1, 80, 88),
+                (1, n_mel_channels, 88),
                 dtype=waveglow.upsample.weight.dtype,
                 device=waveglow.upsample.weight.device)
         else:
@@ -2,76 +2,79 @@
 # import tensorflow as tf
+from dataclasses import dataclass
 from .text import symbols

+# from .text_codec import symbols
+
+@dataclass
 class HParams(object):
     """docstring for HParams."""

     ################################
     # Experiment Parameters        #
     ################################
-    epochs=500
-    iters_per_checkpoint=1000
-    seed=1234
-    dynamic_loss_scaling=True
-    fp16_run=False
-    distributed_run=False
-    dist_backend="nccl"
-    dist_url="tcp://localhost:54321"
-    cudnn_enabled=True
-    cudnn_benchmark=False
-    ignore_layers=["embedding.weight"]
+    epochs = 500
+    iters_per_checkpoint = 1000
+    seed = 1234
+    dynamic_loss_scaling = True
+    fp16_run = False
+    distributed_run = False
+    dist_backend = "nccl"
+    dist_url = "tcp://localhost:54321"
+    cudnn_enabled = True
+    cudnn_benchmark = False
+    ignore_layers = ["embedding.weight"]
     ################################
     # Data Parameters             #
     ################################
-    load_mel_from_disk=False
-    training_files="lists/tts_data_train_processed.txt"
-    validation_files="filelists/tts_data_val_processed.txt"
-    text_cleaners=["english_cleaners"]
+    load_mel_from_disk = False
+    training_files = "lists/tts_data_train_processed.txt"
+    validation_files = "filelists/tts_data_val_processed.txt"
+    text_cleaners = ["english_cleaners"]
     ################################
     # Audio Parameters             #
     ################################
-    max_wav_value=32768.0
-    sampling_rate=16000
-    filter_length=1024
-    hop_length=256
-    win_length=1024
-    n_mel_channels=80
-    mel_fmin=0.0
-    mel_fmax=8000.0
+    max_wav_value = 32768.0
+    sampling_rate = 16000
+    filter_length = 1024
+    hop_length = 256
+    win_length = 1024
+    n_mel_channels: int = 40
+    mel_fmin: float = 0.0
+    mel_fmax: float = 4000.0
     ################################
     # Model Parameters             #
     ################################
-    n_symbols=len(symbols)
-    symbols_embedding_dim=512
+    n_symbols = len(symbols)
+    symbols_embedding_dim = 512
     # Encoder parameters
-    encoder_kernel_size=5
-    encoder_n_convolutions=3
-    encoder_embedding_dim=512
+    encoder_kernel_size = 5
+    encoder_n_convolutions = 3
+    encoder_embedding_dim = 512
     # Decoder parameters
-    n_frames_per_step=1 # currently only 1 is supported
-    decoder_rnn_dim=1024
-    prenet_dim=256
-    max_decoder_steps=1000
-    gate_threshold=0.5
-    p_attention_dropout=0.1
-    p_decoder_dropout=0.1
+    n_frames_per_step = 1 # currently only 1 is supported
+    decoder_rnn_dim = 1024
+    prenet_dim = 256
+    max_decoder_steps = 1000
+    gate_threshold = 0.5
+    p_attention_dropout = 0.1
+    p_decoder_dropout = 0.1
     # Attention parameters
-    attention_rnn_dim=1024
-    attention_dim=128
+    attention_rnn_dim = 1024
+    attention_dim = 128
     # Location Layer parameters
-    attention_location_n_filters=32
-    attention_location_kernel_size=31
+    attention_location_n_filters = 32
+    attention_location_kernel_size = 31
     # Mel-post processing network parameters
-    postnet_embedding_dim=512
-    postnet_kernel_size=5
-    postnet_n_convolutions=5
+    postnet_embedding_dim = 512
+    postnet_kernel_size = 5
+    postnet_n_convolutions = 5
     ################################
     # Optimization Hyperparameters #
     ################################
-    use_saved_learning_rate=False
-    learning_rate=1e-3
-    weight_decay=1e-6
-    grad_clip_thresh=1.0
-    batch_size=4
-    mask_padding=True # set model's padded outputs to padded values
+    use_saved_learning_rate = False
+    learning_rate = 1e-3
+    weight_decay = 1e-6
+    grad_clip_thresh = 1.0
+    batch_size = 4
+    mask_padding = True # set model's padded outputs to padded values
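A hedged sketch of how the @dataclass version of HParams behaves (assuming it is imported from this module; the import path is illustrative). Only the annotated fields shown in the hunk above (n_mel_channels, mel_fmin, mel_fmax) become dataclass fields that can be overridden through __init__; the un-annotated assignments stay plain class attributes and keep their defaults.

from hparams import HParams  # hypothetical import path

hparams = HParams()                                    # all defaults
custom = HParams(n_mel_channels=80, mel_fmax=8000.0)   # only annotated fields are init args
print(hparams.n_mel_channels, custom.n_mel_channels)   # 40 80
print(hparams.batch_size, hparams.sampling_rate)       # 4 16000
# HParams(batch_size=16) would raise TypeError: batch_size has no type
# annotation, so it is not a dataclass field.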