diff --git a/glow.py b/glow.py
index e6060c8..7c8e46c 100644
--- a/glow.py
+++ b/glow.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 # *****************************************************************************
 # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 #
@@ -13,18 +12,19 @@
 #       names of its contributors may be used to endorse or promote products
 #       derived from this software without specific prior written permission.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
-# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 # *****************************************************************************
+import copy
 import torch
 from torch.autograd import Variable
 import torch.nn.functional as F
@@ -33,9 +33,9 @@ import torch.nn.functional as F
 @torch.jit.script
 def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
     n_channels_int = n_channels[0]
-    in_act = input_a + input_b
-    t_act = torch.nn.functional.tanh(in_act[:, :n_channels_int, :])
-    s_act = torch.nn.functional.sigmoid(in_act[:, n_channels_int:, :])
+    in_act = input_a+input_b
+    t_act = torch.tanh(in_act[:, :n_channels_int, :])
+    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
     acts = t_act * s_act
     return acts
@@ -55,12 +55,8 @@ class WaveGlowLoss(torch.nn.Module):
             log_s_total = log_s_total + torch.sum(log_s)
             log_det_W_total += log_det_W_list[i]
 
-        loss = (
-            torch.sum(z * z) / (2 * self.sigma * self.sigma)
-            - log_s_total
-            - log_det_W_total
-        )
-        return loss / (z.size(0) * z.size(1) * z.size(2))
+        loss = torch.sum(z*z)/(2*self.sigma*self.sigma) - log_s_total - log_det_W_total
+        return loss/(z.size(0)*z.size(1)*z.size(2))
 
 
 class Invertible1x1Conv(torch.nn.Module):
@@ -69,19 +65,17 @@ class Invertible1x1Conv(torch.nn.Module):
     """
     The layer outputs both the convolution, and the log determinant
     of its weight matrix. If reverse=True it does convolution with
     inverse
     """
-
     def __init__(self, c):
         super(Invertible1x1Conv, self).__init__()
-        self.conv = torch.nn.Conv1d(
-            c, c, kernel_size=1, stride=1, padding=0, bias=False
-        )
+        self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0,
+                                    bias=False)
 
         # Sample a random orthonormal matrix to initialize weights
         W = torch.qr(torch.FloatTensor(c, c).normal_())[0]
 
         # Ensure determinant is 1.0 not -1.0
         if torch.det(W) < 0:
-            W[:, 0] = -1 * W[:, 0]
+            W[:,0] = -1*W[:,0]
         W = W.view(c, c, 1)
         self.conv.weight.data = W
@@ -92,11 +86,11 @@ class Invertible1x1Conv(torch.nn.Module):
         W = self.conv.weight.squeeze()
 
         if reverse:
-            if not hasattr(self, "W_inverse"):
+            if not hasattr(self, 'W_inverse'):
                 # Reverse computation
-                W_inverse = W.inverse()
+                W_inverse = W.float().inverse()
                 W_inverse = Variable(W_inverse[..., None])
-                if z.type() == "torch.cuda.HalfTensor":
+                if z.type() == 'torch.HalfTensor':
                     W_inverse = W_inverse.half()
                 self.W_inverse = W_inverse
             z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0)
@@ -110,102 +104,86 @@
 class WN(torch.nn.Module):
     """
-    This is the WaveNet like layer for the affine coupling. The primary
-    difference from WaveNet is the convolutions need not be causal. There is
-    also no dilation size reset. The dilation only doubles on each layer
+    This is the WaveNet like layer for the affine coupling. The primary difference
+    from WaveNet is the convolutions need not be causal. There is also no dilation
+    size reset. The dilation only doubles on each layer
     """
-
-    def __init__(
-        self, n_in_channels, n_mel_channels, n_layers, n_channels, kernel_size
-    ):
+    def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels,
+                 kernel_size):
         super(WN, self).__init__()
-        assert kernel_size % 2 == 1
-        assert n_channels % 2 == 0
+        assert(kernel_size % 2 == 1)
+        assert(n_channels % 2 == 0)
         self.n_layers = n_layers
         self.n_channels = n_channels
         self.in_layers = torch.nn.ModuleList()
         self.res_skip_layers = torch.nn.ModuleList()
-        self.cond_layers = torch.nn.ModuleList()
 
         start = torch.nn.Conv1d(n_in_channels, n_channels, 1)
-        start = torch.nn.utils.weight_norm(start, name="weight")
+        start = torch.nn.utils.weight_norm(start, name='weight')
         self.start = start
 
         # Initializing last layer to 0 makes the affine coupling layers
         # do nothing at first. This helps with training stability
-        end = torch.nn.Conv1d(n_channels, 2 * n_in_channels, 1)
+        end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1)
         end.weight.data.zero_()
         end.bias.data.zero_()
         self.end = end
 
+        cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels*n_layers, 1)
+        self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
+
         for i in range(n_layers):
             dilation = 2 ** i
-            padding = int((kernel_size * dilation - dilation) / 2)
-            in_layer = torch.nn.Conv1d(
-                n_channels,
-                2 * n_channels,
-                kernel_size,
-                dilation=dilation,
-                padding=padding,
-            )
-            in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
+            padding = int((kernel_size*dilation - dilation)/2)
+            in_layer = torch.nn.Conv1d(n_channels, 2*n_channels, kernel_size,
+                                       dilation=dilation, padding=padding)
+            in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
             self.in_layers.append(in_layer)
 
-            cond_layer = torch.nn.Conv1d(n_mel_channels, 2 * n_channels, 1)
-            cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
-            self.cond_layers.append(cond_layer)
-
             # last one is not necessary
             if i < n_layers - 1:
-                res_skip_channels = 2 * n_channels
+                res_skip_channels = 2*n_channels
             else:
                 res_skip_channels = n_channels
             res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1)
-            res_skip_layer = torch.nn.utils.weight_norm(
-                res_skip_layer, name="weight"
-            )
+            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
             self.res_skip_layers.append(res_skip_layer)
 
     def forward(self, forward_input):
         audio, spect = forward_input
         audio = self.start(audio)
+        output = torch.zeros_like(audio)
+        n_channels_tensor = torch.IntTensor([self.n_channels])
+
+        spect = self.cond_layer(spect)
+
         for i in range(self.n_layers):
+            spect_offset = i*2*self.n_channels
             acts = fused_add_tanh_sigmoid_multiply(
                 self.in_layers[i](audio),
-                self.cond_layers[i](spect),
-                torch.IntTensor([self.n_channels]),
-            )
+                spect[:,spect_offset:spect_offset+2*self.n_channels,:],
+                n_channels_tensor)
 
             res_skip_acts = self.res_skip_layers[i](acts)
             if i < self.n_layers - 1:
-                audio = res_skip_acts[:, : self.n_channels, :] + audio
-                skip_acts = res_skip_acts[:, self.n_channels :, :]
+                audio = audio + res_skip_acts[:,:self.n_channels,:]
+                output = output + res_skip_acts[:,self.n_channels:,:]
             else:
-                skip_acts = res_skip_acts
+                output = output + res_skip_acts
 
-            if i == 0:
-                output = skip_acts
-            else:
-                output = skip_acts + output
         return self.end(output)
 
 
 class WaveGlow(torch.nn.Module):
-    def __init__(
-        self,
-        n_mel_channels,
-        n_flows,
-        n_group,
-        n_early_every,
-        n_early_size,
-        WN_config,
-    ):
+    def __init__(self, n_mel_channels, n_flows, n_group, n_early_every,
+                 n_early_size, WN_config):
         super(WaveGlow, self).__init__()
 
-        self.upsample = torch.nn.ConvTranspose1d(
-            n_mel_channels, n_mel_channels, 1024, stride=256
-        )
-        assert n_group % 2 == 0
+        self.upsample = torch.nn.ConvTranspose1d(n_mel_channels,
+                                                 n_mel_channels,
+                                                 1024, stride=256)
+        assert(n_group % 2 == 0)
         self.n_flows = n_flows
         self.n_group = n_group
         self.n_early_every = n_early_every
@@ -213,19 +191,18 @@ class WaveGlow(torch.nn.Module):
         self.WN = torch.nn.ModuleList()
         self.convinv = torch.nn.ModuleList()
 
-        n_half = int(n_group / 2)
+        n_half = int(n_group/2)
 
         # Set up layers with the right sizes based on how many dimensions
         # have been output already
         n_remaining_channels = n_group
         for k in range(n_flows):
             if k % self.n_early_every == 0 and k > 0:
-                n_half = n_half - int(self.n_early_size / 2)
+                n_half = n_half - int(self.n_early_size/2)
                 n_remaining_channels = n_remaining_channels - self.n_early_size
             self.convinv.append(Invertible1x1Conv(n_remaining_channels))
-            self.WN.append(WN(n_half, n_mel_channels * n_group, **WN_config))
-        self.n_remaining_channels = n_remaining_channels
-        # Useful during inference
+            self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config))
+        self.n_remaining_channels = n_remaining_channels  # Useful during inference
 
     def forward(self, forward_input):
         """
@@ -236,16 +213,12 @@ class WaveGlow(torch.nn.Module):
 
         # Upsample spectrogram to size of audio
         spect = self.upsample(spect)
-        assert spect.size(2) >= audio.size(1)
+        assert(spect.size(2) >= audio.size(1))
         if spect.size(2) > audio.size(1):
-            spect = spect[:, :, : audio.size(1)]
+            spect = spect[:, :, :audio.size(1)]
 
         spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
-        spect = (
-            spect.contiguous()
-            .view(spect.size(0), spect.size(1), -1)
-            .permute(0, 2, 1)
-        )
+        spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)
 
         audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1)
         output_audio = []
@@ -254,26 +227,26 @@ class WaveGlow(torch.nn.Module):
 
         for k in range(self.n_flows):
             if k % self.n_early_every == 0 and k > 0:
-                output_audio.append(audio[:, : self.n_early_size, :])
-                audio = audio[:, self.n_early_size :, :]
+                output_audio.append(audio[:,:self.n_early_size,:])
+                audio = audio[:,self.n_early_size:,:]
 
             audio, log_det_W = self.convinv[k](audio)
             log_det_W_list.append(log_det_W)
 
-            n_half = int(audio.size(1) / 2)
-            audio_0 = audio[:, :n_half, :]
-            audio_1 = audio[:, n_half:, :]
+            n_half = int(audio.size(1)/2)
+            audio_0 = audio[:,:n_half,:]
+            audio_1 = audio[:,n_half:,:]
 
             output = self.WN[k]((audio_0, spect))
             log_s = output[:, n_half:, :]
             b = output[:, :n_half, :]
-            audio_1 = torch.exp(log_s) * audio_1 + b
+            audio_1 = torch.exp(log_s)*audio_1 + b
             log_s_list.append(log_s)
 
-            audio = torch.cat([audio_0, audio_1], 1)
+            audio = torch.cat([audio_0, audio_1],1)
 
         output_audio.append(audio)
-        return torch.cat(output_audio, 1), log_s_list, log_det_W_list
+        return torch.cat(output_audio,1), log_s_list, log_det_W_list
 
     def infer(self, spect, sigma=1.0):
         spect = self.upsample(spect)
@@ -282,52 +255,41 @@ class WaveGlow(torch.nn.Module):
         spect = spect[:, :, :-time_cutoff]
 
         spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
-        spect = (
-            spect.contiguous()
-            .view(spect.size(0), spect.size(1), -1)
-            .permute(0, 2, 1)
-        )
+        spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)
 
-        if spect.type() == "torch.cuda.HalfTensor":
-            audio = torch.cuda.HalfTensor(
-                spect.size(0), self.n_remaining_channels, spect.size(2)
-            ).normal_()
+        if spect.type() == 'torch.HalfTensor':
+            audio = torch.HalfTensor(spect.size(0),
+                                     self.n_remaining_channels,
+                                     spect.size(2)).normal_()
         else:
-            # cuda.FloatTensor -> FloatTensor
-            audio = torch.FloatTensor(
-                spect.size(0), self.n_remaining_channels, spect.size(2)
-            ).normal_()
+            audio = torch.FloatTensor(spect.size(0),
+                                      self.n_remaining_channels,
+                                      spect.size(2)).normal_()
 
-        audio = torch.autograd.Variable(sigma * audio)
+        audio = torch.autograd.Variable(sigma*audio)
 
         for k in reversed(range(self.n_flows)):
-            n_half = int(audio.size(1) / 2)
-            audio_0 = audio[:, :n_half, :]
-            audio_1 = audio[:, n_half:, :]
+            n_half = int(audio.size(1)/2)
+            audio_0 = audio[:,:n_half,:]
+            audio_1 = audio[:,n_half:,:]
 
             output = self.WN[k]((audio_0, spect))
+
             s = output[:, n_half:, :]
             b = output[:, :n_half, :]
-            audio_1 = (audio_1 - b) / torch.exp(s)
-            audio = torch.cat([audio_0, audio_1], 1)
+            audio_1 = (audio_1 - b)/torch.exp(s)
+            audio = torch.cat([audio_0, audio_1],1)
 
             audio = self.convinv[k](audio, reverse=True)
 
             if k % self.n_early_every == 0 and k > 0:
-                if spect.type() == "torch.cuda.HalfTensor":
-                    z = torch.cuda.HalfTensor(
-                        spect.size(0), self.n_early_size, spect.size(2)
-                    ).normal_()
+                if spect.type() == 'torch.HalfTensor':
+                    z = torch.HalfTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_()
                 else:
-                    # cuda.FloatTensor -> FloatTensor
-                    z = torch.FloatTensor(
-                        spect.size(0), self.n_early_size, spect.size(2)
-                    ).normal_()
-                audio = torch.cat((sigma * z, audio), 1)
+                    z = torch.FloatTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_()
+                audio = torch.cat((sigma*z, audio),1)
 
-        audio = (
-            audio.permute(0, 2, 1).contiguous().view(audio.size(0), -1).data
-        )
+        audio = audio.permute(0,2,1).contiguous().view(audio.size(0), -1).data
         return audio
 
     @staticmethod
@@ -336,7 +298,7 @@ class WaveGlow(torch.nn.Module):
         for WN in waveglow.WN:
             WN.start = torch.nn.utils.remove_weight_norm(WN.start)
             WN.in_layers = remove(WN.in_layers)
-            WN.cond_layers = remove(WN.cond_layers)
+            WN.cond_layer = torch.nn.utils.remove_weight_norm(WN.cond_layer)
             WN.res_skip_layers = remove(WN.res_skip_layers)
         return waveglow
diff --git a/glow_old.py b/glow_old.py
new file mode 100644
index 0000000..e6060c8
--- /dev/null
+++ b/glow_old.py
@@ -0,0 +1,349 @@
+# -*- coding: utf-8 -*-
+# *****************************************************************************
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of the NVIDIA CORPORATION nor the
+#       names of its contributors may be used to endorse or promote products
+#       derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+import torch
+from torch.autograd import Variable
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+    n_channels_int = n_channels[0]
+    in_act = input_a + input_b
+    t_act = torch.nn.functional.tanh(in_act[:, :n_channels_int, :])
+    s_act = torch.nn.functional.sigmoid(in_act[:, n_channels_int:, :])
+    acts = t_act * s_act
+    return acts
+
+
+class WaveGlowLoss(torch.nn.Module):
+    def __init__(self, sigma=1.0):
+        super(WaveGlowLoss, self).__init__()
+        self.sigma = sigma
+
+    def forward(self, model_output):
+        z, log_s_list, log_det_W_list = model_output
+        for i, log_s in enumerate(log_s_list):
+            if i == 0:
+                log_s_total = torch.sum(log_s)
+                log_det_W_total = log_det_W_list[i]
+            else:
+                log_s_total = log_s_total + torch.sum(log_s)
+                log_det_W_total += log_det_W_list[i]
+
+        loss = (
+            torch.sum(z * z) / (2 * self.sigma * self.sigma)
+            - log_s_total
+            - log_det_W_total
+        )
+        return loss / (z.size(0) * z.size(1) * z.size(2))
+
+
+class Invertible1x1Conv(torch.nn.Module):
+    """
+    The layer outputs both the convolution, and the log determinant
+    of its weight matrix. If reverse=True it does convolution with
+    inverse
+    """
+
+    def __init__(self, c):
+        super(Invertible1x1Conv, self).__init__()
+        self.conv = torch.nn.Conv1d(
+            c, c, kernel_size=1, stride=1, padding=0, bias=False
+        )
+
+        # Sample a random orthonormal matrix to initialize weights
+        W = torch.qr(torch.FloatTensor(c, c).normal_())[0]
+
+        # Ensure determinant is 1.0 not -1.0
+        if torch.det(W) < 0:
+            W[:, 0] = -1 * W[:, 0]
+        W = W.view(c, c, 1)
+        self.conv.weight.data = W
+
+    def forward(self, z, reverse=False):
+        # shape
+        batch_size, group_size, n_of_groups = z.size()
+
+        W = self.conv.weight.squeeze()
+
+        if reverse:
+            if not hasattr(self, "W_inverse"):
+                # Reverse computation
+                W_inverse = W.inverse()
+                W_inverse = Variable(W_inverse[..., None])
+                if z.type() == "torch.cuda.HalfTensor":
+                    W_inverse = W_inverse.half()
+                self.W_inverse = W_inverse
+            z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0)
+            return z
+        else:
+            # Forward computation
+            log_det_W = batch_size * n_of_groups * torch.logdet(W)
+            z = self.conv(z)
+            return z, log_det_W
+
+
+class WN(torch.nn.Module):
+    """
+    This is the WaveNet like layer for the affine coupling. The primary
+    difference from WaveNet is the convolutions need not be causal. There is
+    also no dilation size reset. The dilation only doubles on each layer
+    """
+
+    def __init__(
+        self, n_in_channels, n_mel_channels, n_layers, n_channels, kernel_size
+    ):
+        super(WN, self).__init__()
+        assert kernel_size % 2 == 1
+        assert n_channels % 2 == 0
+        self.n_layers = n_layers
+        self.n_channels = n_channels
+        self.in_layers = torch.nn.ModuleList()
+        self.res_skip_layers = torch.nn.ModuleList()
+        self.cond_layers = torch.nn.ModuleList()
+
+        start = torch.nn.Conv1d(n_in_channels, n_channels, 1)
+        start = torch.nn.utils.weight_norm(start, name="weight")
+        self.start = start
+
+        # Initializing last layer to 0 makes the affine coupling layers
+        # do nothing at first. This helps with training stability
+        end = torch.nn.Conv1d(n_channels, 2 * n_in_channels, 1)
+        end.weight.data.zero_()
+        end.bias.data.zero_()
+        self.end = end
+
+        for i in range(n_layers):
+            dilation = 2 ** i
+            padding = int((kernel_size * dilation - dilation) / 2)
+            in_layer = torch.nn.Conv1d(
+                n_channels,
+                2 * n_channels,
+                kernel_size,
+                dilation=dilation,
+                padding=padding,
+            )
+            in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
+            self.in_layers.append(in_layer)
+
+            cond_layer = torch.nn.Conv1d(n_mel_channels, 2 * n_channels, 1)
+            cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
+            self.cond_layers.append(cond_layer)
+
+            # last one is not necessary
+            if i < n_layers - 1:
+                res_skip_channels = 2 * n_channels
+            else:
+                res_skip_channels = n_channels
+            res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1)
+            res_skip_layer = torch.nn.utils.weight_norm(
+                res_skip_layer, name="weight"
+            )
+            self.res_skip_layers.append(res_skip_layer)
+
+    def forward(self, forward_input):
+        audio, spect = forward_input
+        audio = self.start(audio)
+        for i in range(self.n_layers):
+            acts = fused_add_tanh_sigmoid_multiply(
+                self.in_layers[i](audio),
+                self.cond_layers[i](spect),
+                torch.IntTensor([self.n_channels]),
+            )
+
+            res_skip_acts = self.res_skip_layers[i](acts)
+            if i < self.n_layers - 1:
+                audio = res_skip_acts[:, : self.n_channels, :] + audio
+                skip_acts = res_skip_acts[:, self.n_channels :, :]
+            else:
+                skip_acts = res_skip_acts
+
+            if i == 0:
+                output = skip_acts
+            else:
+                output = skip_acts + output
+        return self.end(output)
+
+
+class WaveGlow(torch.nn.Module):
+    def __init__(
+        self,
+        n_mel_channels,
+        n_flows,
+        n_group,
+        n_early_every,
+        n_early_size,
+        WN_config,
+    ):
+        super(WaveGlow, self).__init__()
+
+        self.upsample = torch.nn.ConvTranspose1d(
+            n_mel_channels, n_mel_channels, 1024, stride=256
+        )
+        assert n_group % 2 == 0
+        self.n_flows = n_flows
+        self.n_group = n_group
+        self.n_early_every = n_early_every
+        self.n_early_size = n_early_size
+        self.WN = torch.nn.ModuleList()
+        self.convinv = torch.nn.ModuleList()
+
+        n_half = int(n_group / 2)
+
+        # Set up layers with the right sizes based on how many dimensions
+        # have been output already
+        n_remaining_channels = n_group
+        for k in range(n_flows):
+            if k % self.n_early_every == 0 and k > 0:
+                n_half = n_half - int(self.n_early_size / 2)
+                n_remaining_channels = n_remaining_channels - self.n_early_size
+            self.convinv.append(Invertible1x1Conv(n_remaining_channels))
+            self.WN.append(WN(n_half, n_mel_channels * n_group, **WN_config))
+        self.n_remaining_channels = n_remaining_channels
+        # Useful during inference
+
+    def forward(self, forward_input):
+        """
+        forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames
+        forward_input[1] = audio: batch x time
+        """
+        spect, audio = forward_input
+
+        # Upsample spectrogram to size of audio
+        spect = self.upsample(spect)
+        assert spect.size(2) >= audio.size(1)
+        if spect.size(2) > audio.size(1):
+            spect = spect[:, :, : audio.size(1)]
+
+        spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
+        spect = (
+            spect.contiguous()
+            .view(spect.size(0), spect.size(1), -1)
+            .permute(0, 2, 1)
+        )
+
+        audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1)
+        output_audio = []
+        log_s_list = []
+        log_det_W_list = []
+
+        for k in range(self.n_flows):
+            if k % self.n_early_every == 0 and k > 0:
+                output_audio.append(audio[:, : self.n_early_size, :])
+                audio = audio[:, self.n_early_size :, :]
+
+            audio, log_det_W = self.convinv[k](audio)
+            log_det_W_list.append(log_det_W)
+
+            n_half = int(audio.size(1) / 2)
+            audio_0 = audio[:, :n_half, :]
+            audio_1 = audio[:, n_half:, :]
+
+            output = self.WN[k]((audio_0, spect))
+            log_s = output[:, n_half:, :]
+            b = output[:, :n_half, :]
+            audio_1 = torch.exp(log_s) * audio_1 + b
+            log_s_list.append(log_s)
+
+            audio = torch.cat([audio_0, audio_1], 1)
+
+        output_audio.append(audio)
+        return torch.cat(output_audio, 1), log_s_list, log_det_W_list
+
+    def infer(self, spect, sigma=1.0):
+        spect = self.upsample(spect)
+        # trim conv artifacts. maybe pad spec to kernel multiple
+        time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0]
+        spect = spect[:, :, :-time_cutoff]
+
+        spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
+        spect = (
+            spect.contiguous()
+            .view(spect.size(0), spect.size(1), -1)
+            .permute(0, 2, 1)
+        )
+
+        if spect.type() == "torch.cuda.HalfTensor":
+            audio = torch.cuda.HalfTensor(
+                spect.size(0), self.n_remaining_channels, spect.size(2)
+            ).normal_()
+        else:
+            # cuda.FloatTensor -> FloatTensor
+            audio = torch.FloatTensor(
+                spect.size(0), self.n_remaining_channels, spect.size(2)
+            ).normal_()
+
+        audio = torch.autograd.Variable(sigma * audio)
+
+        for k in reversed(range(self.n_flows)):
+            n_half = int(audio.size(1) / 2)
+            audio_0 = audio[:, :n_half, :]
+            audio_1 = audio[:, n_half:, :]
+
+            output = self.WN[k]((audio_0, spect))
+            s = output[:, n_half:, :]
+            b = output[:, :n_half, :]
+            audio_1 = (audio_1 - b) / torch.exp(s)
+            audio = torch.cat([audio_0, audio_1], 1)
+
+            audio = self.convinv[k](audio, reverse=True)
+
+            if k % self.n_early_every == 0 and k > 0:
+                if spect.type() == "torch.cuda.HalfTensor":
+                    z = torch.cuda.HalfTensor(
+                        spect.size(0), self.n_early_size, spect.size(2)
+                    ).normal_()
+                else:
+                    # cuda.FloatTensor -> FloatTensor
+                    z = torch.FloatTensor(
+                        spect.size(0), self.n_early_size, spect.size(2)
+                    ).normal_()
+                audio = torch.cat((sigma * z, audio), 1)
+
+        audio = (
+            audio.permute(0, 2, 1).contiguous().view(audio.size(0), -1).data
+        )
+        return audio
+
+    @staticmethod
+    def remove_weightnorm(model):
+        waveglow = model
+        for WN in waveglow.WN:
+            WN.start = torch.nn.utils.remove_weight_norm(WN.start)
+            WN.in_layers = remove(WN.in_layers)
+            WN.cond_layers = remove(WN.cond_layers)
+            WN.res_skip_layers = remove(WN.res_skip_layers)
+        return waveglow
+
+
+def remove(conv_list):
+    new_conv_list = torch.nn.ModuleList()
+    for old_conv in conv_list:
+        old_conv = torch.nn.utils.remove_weight_norm(old_conv)
+        new_conv_list.append(old_conv)
+    return new_conv_list
diff --git a/taco2/denoiser.py b/taco2/denoiser.py
index 3d3f45d..de1836e 100644
--- a/taco2/denoiser.py
+++ b/taco2/denoiser.py
@@ -7,19 +7,19 @@ class Denoiser(torch.nn.Module):
     """ Removes model bias from audio produced with waveglow """
 
     def __init__(self, waveglow, filter_length=1024, n_overlap=4,
-                 win_length=1024, mode='zeros'):
+                 win_length=1024, mode='zeros', n_mel_channels=80,):
         super(Denoiser, self).__init__()
         self.stft = STFT(filter_length=filter_length,
                          hop_length=int(filter_length/n_overlap),
                          win_length=win_length).cpu()
         if mode == 'zeros':
             mel_input = torch.zeros(
-                (1, 80, 88),
+                (1, n_mel_channels, 88),
                 dtype=waveglow.upsample.weight.dtype,
                 device=waveglow.upsample.weight.device)
         elif mode == 'normal':
             mel_input = torch.randn(
-                (1, 80, 88),
+                (1, n_mel_channels, 88),
                 dtype=waveglow.upsample.weight.dtype,
                 device=waveglow.upsample.weight.device)
         else:
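
Review note, not part of the patch: the Denoiser change above stops hard-coding 80 mel channels and takes the count as a constructor argument, which lines up with the 40-channel setting introduced in hparams.py below. A minimal CPU wiring sketch follows; the layer sizes and the denoiser(audio, strength) call are assumptions based on the upstream WaveGlow repository, not something this diff shows.

import torch
from glow import WaveGlow
from taco2.denoiser import Denoiser

# Illustrative hyperparameters only; real values come from the training config.
WN_config = dict(n_layers=8, n_channels=256, kernel_size=3)
waveglow = WaveGlow(n_mel_channels=40, n_flows=12, n_group=8,
                    n_early_every=4, n_early_size=2, WN_config=WN_config)
waveglow.eval()

# n_mel_channels must match the model; the old Denoiser silently assumed 80.
denoiser = Denoiser(waveglow, n_mel_channels=40)

mel = torch.zeros(1, 40, 88)  # batch x n_mel_channels x frames
with torch.no_grad():
    audio = waveglow.infer(mel, sigma=0.6)
    audio = denoiser(audio, strength=0.01)  # forward(audio, strength) assumed from upstream
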
diff --git a/taco2/hparams.py b/taco2/hparams.py
index 255f5e2..d123aea 100644
--- a/taco2/hparams.py
+++ b/taco2/hparams.py
@@ -2,76 +2,79 @@
 # import tensorflow as tf
 from dataclasses import dataclass
 from .text import symbols
+
 # from .text_codec import symbols
+
 @dataclass
 class HParams(object):
     """docstring for HParams."""
+
     ################################
     # Experiment Parameters        #
     ################################
-    epochs=500
-    iters_per_checkpoint=1000
-    seed=1234
-    dynamic_loss_scaling=True
-    fp16_run=False
-    distributed_run=False
-    dist_backend="nccl"
-    dist_url="tcp://localhost:54321"
-    cudnn_enabled=True
-    cudnn_benchmark=False
-    ignore_layers=["embedding.weight"]
+    epochs = 500
+    iters_per_checkpoint = 1000
+    seed = 1234
+    dynamic_loss_scaling = True
+    fp16_run = False
+    distributed_run = False
+    dist_backend = "nccl"
+    dist_url = "tcp://localhost:54321"
+    cudnn_enabled = True
+    cudnn_benchmark = False
+    ignore_layers = ["embedding.weight"]
 
     ################################
     # Data Parameters              #
     ################################
-    load_mel_from_disk=False
-    training_files="lists/tts_data_train_processed.txt"
-    validation_files="filelists/tts_data_val_processed.txt"
-    text_cleaners=["english_cleaners"]
+    load_mel_from_disk = False
+    training_files = "lists/tts_data_train_processed.txt"
+    validation_files = "filelists/tts_data_val_processed.txt"
+    text_cleaners = ["english_cleaners"]
 
     ################################
     # Audio Parameters             #
    ################################
-    max_wav_value=32768.0
-    sampling_rate=16000
-    filter_length=1024
-    hop_length=256
-    win_length=1024
-    n_mel_channels=80
-    mel_fmin=0.0
-    mel_fmax=8000.0
+    max_wav_value = 32768.0
+    sampling_rate = 16000
+    filter_length = 1024
+    hop_length = 256
+    win_length = 1024
+    n_mel_channels: int = 40
+    mel_fmin: float = 0.0
+    mel_fmax: float = 4000.0
 
     ################################
     # Model Parameters             #
     ################################
-    n_symbols=len(symbols)
-    symbols_embedding_dim=512
+    n_symbols = len(symbols)
+    symbols_embedding_dim = 512
 
     # Encoder parameters
-    encoder_kernel_size=5
-    encoder_n_convolutions=3
-    encoder_embedding_dim=512
+    encoder_kernel_size = 5
+    encoder_n_convolutions = 3
+    encoder_embedding_dim = 512
 
     # Decoder parameters
-    n_frames_per_step=1 # currently only 1 is supported
-    decoder_rnn_dim=1024
-    prenet_dim=256
-    max_decoder_steps=1000
-    gate_threshold=0.5
-    p_attention_dropout=0.1
-    p_decoder_dropout=0.1
+    n_frames_per_step = 1  # currently only 1 is supported
+    decoder_rnn_dim = 1024
+    prenet_dim = 256
+    max_decoder_steps = 1000
+    gate_threshold = 0.5
+    p_attention_dropout = 0.1
+    p_decoder_dropout = 0.1
 
     # Attention parameters
-    attention_rnn_dim=1024
-    attention_dim=128
+    attention_rnn_dim = 1024
+    attention_dim = 128
 
     # Location Layer parameters
-    attention_location_n_filters=32
-    attention_location_kernel_size=31
+    attention_location_n_filters = 32
+    attention_location_kernel_size = 31
 
     # Mel-post processing network parameters
-    postnet_embedding_dim=512
-    postnet_kernel_size=5
-    postnet_n_convolutions=5
+    postnet_embedding_dim = 512
+    postnet_kernel_size = 5
+    postnet_n_convolutions = 5
 
     ################################
     # Optimization Hyperparameters #
     ################################
-    use_saved_learning_rate=False
-    learning_rate=1e-3
-    weight_decay=1e-6
-    grad_clip_thresh=1.0
-    batch_size=4
-    mask_padding=True # set model's padded outputs to padded values
+    use_saved_learning_rate = False
+    learning_rate = 1e-3
+    weight_decay = 1e-6
+    grad_clip_thresh = 1.0
+    batch_size = 4
+    mask_padding = True  # set model's padded outputs to padded values
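
Review note, not part of the patch: the main functional change in glow.py is that WN now builds a single fused cond_layer of width 2*n_channels*n_layers and slices its output per layer (see spect_offset in WN.forward), instead of keeping a ModuleList of per-layer conditioning convolutions; remove_weightnorm is updated to match, and glow_old.py keeps the previous layout, presumably so that older checkpoints can still be loaded and converted. The refactor is weight-compatible: stacking the old 1x1 convolutions along the output-channel axis and slicing with the same offsets reproduces their outputs. A small self-contained check with made-up sizes:

import torch

n_mel, n_channels, n_layers, frames = 80, 4, 3, 10
spect = torch.randn(1, n_mel, frames)

# Old layout: one 1x1 conv per WN layer.
cond_layers = [torch.nn.Conv1d(n_mel, 2 * n_channels, 1) for _ in range(n_layers)]

# New layout: one conv whose weights/biases are the old ones stacked along
# the output-channel dimension.
fused = torch.nn.Conv1d(n_mel, 2 * n_channels * n_layers, 1)
fused.weight.data = torch.cat([c.weight.data for c in cond_layers], dim=0)
fused.bias.data = torch.cat([c.bias.data for c in cond_layers], dim=0)

out = fused(spect)
for i in range(n_layers):
    offset = i * 2 * n_channels  # same indexing as spect_offset in WN.forward
    assert torch.allclose(out[:, offset:offset + 2 * n_channels, :],
                          cond_layers[i](spect), atol=1e-6)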