From 342b230b93d615db4bfae6c2811fa6eccd170696 Mon Sep 17 00:00:00 2001
From: Malar
Date: Sat, 12 Oct 2019 14:40:58 +0530
Subject: [PATCH] Compatibility with pretrained models

---
 glow.py                    | 216 ++++++++++++++++++++++---------------
 glow_old.py => glow_new.py | 216 +++++++++++++++----------------
 taco2/hparams.py           |   4 +-
 3 files changed, 218 insertions(+), 218 deletions(-)
 rename glow_old.py => glow_new.py (59%)

diff --git a/glow.py b/glow.py
index 7c8e46c..e6060c8 100644
--- a/glow.py
+++ b/glow.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # *****************************************************************************
 # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 #
@@ -12,19 +13,18 @@
 # names of its contributors may be used to endorse or promote products
 # derived from this software without specific prior written permission.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 # *****************************************************************************
-import copy
 import torch
 from torch.autograd import Variable
 import torch.nn.functional as F
@@ -33,9 +33,9 @@ import torch.nn.functional as F
 @torch.jit.script
 def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
     n_channels_int = n_channels[0]
-    in_act = input_a+input_b
-    t_act = torch.tanh(in_act[:, :n_channels_int, :])
-    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+    in_act = input_a + input_b
+    t_act = torch.nn.functional.tanh(in_act[:, :n_channels_int, :])
+    s_act = torch.nn.functional.sigmoid(in_act[:, n_channels_int:, :])
     acts = t_act * s_act
     return acts
 
@@ -55,8 +55,12 @@ class WaveGlowLoss(torch.nn.Module):
                 log_s_total = log_s_total + torch.sum(log_s)
                 log_det_W_total += log_det_W_list[i]
 
-        loss = torch.sum(z*z)/(2*self.sigma*self.sigma) - log_s_total - log_det_W_total
-        return loss/(z.size(0)*z.size(1)*z.size(2))
+        loss = (
+            torch.sum(z * z) / (2 * self.sigma * self.sigma)
+            - log_s_total
+            - log_det_W_total
+        )
+        return loss / (z.size(0) * z.size(1) * z.size(2))
 
 
 class Invertible1x1Conv(torch.nn.Module):
@@ -65,17 +69,19 @@ class Invertible1x1Conv(torch.nn.Module):
     of its weight matrix. 
If reverse=True it does convolution with inverse """ + def __init__(self, c): super(Invertible1x1Conv, self).__init__() - self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0, - bias=False) + self.conv = torch.nn.Conv1d( + c, c, kernel_size=1, stride=1, padding=0, bias=False + ) # Sample a random orthonormal matrix to initialize weights W = torch.qr(torch.FloatTensor(c, c).normal_())[0] # Ensure determinant is 1.0 not -1.0 if torch.det(W) < 0: - W[:,0] = -1*W[:,0] + W[:, 0] = -1 * W[:, 0] W = W.view(c, c, 1) self.conv.weight.data = W @@ -86,11 +92,11 @@ class Invertible1x1Conv(torch.nn.Module): W = self.conv.weight.squeeze() if reverse: - if not hasattr(self, 'W_inverse'): + if not hasattr(self, "W_inverse"): # Reverse computation - W_inverse = W.float().inverse() + W_inverse = W.inverse() W_inverse = Variable(W_inverse[..., None]) - if z.type() == 'torch.HalfTensor': + if z.type() == "torch.cuda.HalfTensor": W_inverse = W_inverse.half() self.W_inverse = W_inverse z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) @@ -104,86 +110,102 @@ class Invertible1x1Conv(torch.nn.Module): class WN(torch.nn.Module): """ - This is the WaveNet like layer for the affine coupling. The primary difference - from WaveNet is the convolutions need not be causal. There is also no dilation - size reset. The dilation only doubles on each layer + This is the WaveNet like layer for the affine coupling. The primary + difference from WaveNet is the convolutions need not be causal. There is + also no dilation size reset. The dilation only doubles on each layer """ - def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels, - kernel_size): + + def __init__( + self, n_in_channels, n_mel_channels, n_layers, n_channels, kernel_size + ): super(WN, self).__init__() - assert(kernel_size % 2 == 1) - assert(n_channels % 2 == 0) + assert kernel_size % 2 == 1 + assert n_channels % 2 == 0 self.n_layers = n_layers self.n_channels = n_channels self.in_layers = torch.nn.ModuleList() self.res_skip_layers = torch.nn.ModuleList() + self.cond_layers = torch.nn.ModuleList() start = torch.nn.Conv1d(n_in_channels, n_channels, 1) - start = torch.nn.utils.weight_norm(start, name='weight') + start = torch.nn.utils.weight_norm(start, name="weight") self.start = start # Initializing last layer to 0 makes the affine coupling layers # do nothing at first. 
This helps with training stability - end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1) + end = torch.nn.Conv1d(n_channels, 2 * n_in_channels, 1) end.weight.data.zero_() end.bias.data.zero_() self.end = end - cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels*n_layers, 1) - self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') - for i in range(n_layers): dilation = 2 ** i - padding = int((kernel_size*dilation - dilation)/2) - in_layer = torch.nn.Conv1d(n_channels, 2*n_channels, kernel_size, - dilation=dilation, padding=padding) - in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d( + n_channels, + 2 * n_channels, + kernel_size, + dilation=dilation, + padding=padding, + ) + in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") self.in_layers.append(in_layer) + cond_layer = torch.nn.Conv1d(n_mel_channels, 2 * n_channels, 1) + cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") + self.cond_layers.append(cond_layer) # last one is not necessary if i < n_layers - 1: - res_skip_channels = 2*n_channels + res_skip_channels = 2 * n_channels else: res_skip_channels = n_channels res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) - res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') + res_skip_layer = torch.nn.utils.weight_norm( + res_skip_layer, name="weight" + ) self.res_skip_layers.append(res_skip_layer) def forward(self, forward_input): audio, spect = forward_input audio = self.start(audio) - output = torch.zeros_like(audio) - n_channels_tensor = torch.IntTensor([self.n_channels]) - - spect = self.cond_layer(spect) - for i in range(self.n_layers): - spect_offset = i*2*self.n_channels acts = fused_add_tanh_sigmoid_multiply( self.in_layers[i](audio), - spect[:,spect_offset:spect_offset+2*self.n_channels,:], - n_channels_tensor) + self.cond_layers[i](spect), + torch.IntTensor([self.n_channels]), + ) res_skip_acts = self.res_skip_layers[i](acts) if i < self.n_layers - 1: - audio = audio + res_skip_acts[:,:self.n_channels,:] - output = output + res_skip_acts[:,self.n_channels:,:] + audio = res_skip_acts[:, : self.n_channels, :] + audio + skip_acts = res_skip_acts[:, self.n_channels :, :] else: - output = output + res_skip_acts + skip_acts = res_skip_acts + if i == 0: + output = skip_acts + else: + output = skip_acts + output return self.end(output) class WaveGlow(torch.nn.Module): - def __init__(self, n_mel_channels, n_flows, n_group, n_early_every, - n_early_size, WN_config): + def __init__( + self, + n_mel_channels, + n_flows, + n_group, + n_early_every, + n_early_size, + WN_config, + ): super(WaveGlow, self).__init__() - self.upsample = torch.nn.ConvTranspose1d(n_mel_channels, - n_mel_channels, - 1024, stride=256) - assert(n_group % 2 == 0) + self.upsample = torch.nn.ConvTranspose1d( + n_mel_channels, n_mel_channels, 1024, stride=256 + ) + assert n_group % 2 == 0 self.n_flows = n_flows self.n_group = n_group self.n_early_every = n_early_every @@ -191,18 +213,19 @@ class WaveGlow(torch.nn.Module): self.WN = torch.nn.ModuleList() self.convinv = torch.nn.ModuleList() - n_half = int(n_group/2) + n_half = int(n_group / 2) # Set up layers with the right sizes based on how many dimensions # have been output already n_remaining_channels = n_group for k in range(n_flows): if k % self.n_early_every == 0 and k > 0: - n_half = n_half - int(self.n_early_size/2) + n_half = n_half - int(self.n_early_size / 2) n_remaining_channels 
= n_remaining_channels - self.n_early_size self.convinv.append(Invertible1x1Conv(n_remaining_channels)) - self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config)) - self.n_remaining_channels = n_remaining_channels # Useful during inference + self.WN.append(WN(n_half, n_mel_channels * n_group, **WN_config)) + self.n_remaining_channels = n_remaining_channels + # Useful during inference def forward(self, forward_input): """ @@ -213,12 +236,16 @@ class WaveGlow(torch.nn.Module): # Upsample spectrogram to size of audio spect = self.upsample(spect) - assert(spect.size(2) >= audio.size(1)) + assert spect.size(2) >= audio.size(1) if spect.size(2) > audio.size(1): - spect = spect[:, :, :audio.size(1)] + spect = spect[:, :, : audio.size(1)] spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) - spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) + spect = ( + spect.contiguous() + .view(spect.size(0), spect.size(1), -1) + .permute(0, 2, 1) + ) audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1) output_audio = [] @@ -227,26 +254,26 @@ class WaveGlow(torch.nn.Module): for k in range(self.n_flows): if k % self.n_early_every == 0 and k > 0: - output_audio.append(audio[:,:self.n_early_size,:]) - audio = audio[:,self.n_early_size:,:] + output_audio.append(audio[:, : self.n_early_size, :]) + audio = audio[:, self.n_early_size :, :] audio, log_det_W = self.convinv[k](audio) log_det_W_list.append(log_det_W) - n_half = int(audio.size(1)/2) - audio_0 = audio[:,:n_half,:] - audio_1 = audio[:,n_half:,:] + n_half = int(audio.size(1) / 2) + audio_0 = audio[:, :n_half, :] + audio_1 = audio[:, n_half:, :] output = self.WN[k]((audio_0, spect)) log_s = output[:, n_half:, :] b = output[:, :n_half, :] - audio_1 = torch.exp(log_s)*audio_1 + b + audio_1 = torch.exp(log_s) * audio_1 + b log_s_list.append(log_s) - audio = torch.cat([audio_0, audio_1],1) + audio = torch.cat([audio_0, audio_1], 1) output_audio.append(audio) - return torch.cat(output_audio,1), log_s_list, log_det_W_list + return torch.cat(output_audio, 1), log_s_list, log_det_W_list def infer(self, spect, sigma=1.0): spect = self.upsample(spect) @@ -255,41 +282,52 @@ class WaveGlow(torch.nn.Module): spect = spect[:, :, :-time_cutoff] spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) - spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) + spect = ( + spect.contiguous() + .view(spect.size(0), spect.size(1), -1) + .permute(0, 2, 1) + ) - if spect.type() == 'torch.HalfTensor': - audio = torch.HalfTensor(spect.size(0), - self.n_remaining_channels, - spect.size(2)).normal_() + if spect.type() == "torch.cuda.HalfTensor": + audio = torch.cuda.HalfTensor( + spect.size(0), self.n_remaining_channels, spect.size(2) + ).normal_() else: - audio = torch.FloatTensor(spect.size(0), - self.n_remaining_channels, - spect.size(2)).normal_() + # cuda.FloatTensor -> FloatTensor + audio = torch.FloatTensor( + spect.size(0), self.n_remaining_channels, spect.size(2) + ).normal_() - audio = torch.autograd.Variable(sigma*audio) + audio = torch.autograd.Variable(sigma * audio) for k in reversed(range(self.n_flows)): - n_half = int(audio.size(1)/2) - audio_0 = audio[:,:n_half,:] - audio_1 = audio[:,n_half:,:] + n_half = int(audio.size(1) / 2) + audio_0 = audio[:, :n_half, :] + audio_1 = audio[:, n_half:, :] output = self.WN[k]((audio_0, spect)) - s = output[:, n_half:, :] b = output[:, :n_half, :] - audio_1 = (audio_1 - b)/torch.exp(s) - audio = torch.cat([audio_0, 
audio_1],1) + audio_1 = (audio_1 - b) / torch.exp(s) + audio = torch.cat([audio_0, audio_1], 1) audio = self.convinv[k](audio, reverse=True) if k % self.n_early_every == 0 and k > 0: - if spect.type() == 'torch.HalfTensor': - z = torch.HalfTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_() + if spect.type() == "torch.cuda.HalfTensor": + z = torch.cuda.HalfTensor( + spect.size(0), self.n_early_size, spect.size(2) + ).normal_() else: - z = torch.FloatTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_() - audio = torch.cat((sigma*z, audio),1) + # cuda.FloatTensor -> FloatTensor + z = torch.FloatTensor( + spect.size(0), self.n_early_size, spect.size(2) + ).normal_() + audio = torch.cat((sigma * z, audio), 1) - audio = audio.permute(0,2,1).contiguous().view(audio.size(0), -1).data + audio = ( + audio.permute(0, 2, 1).contiguous().view(audio.size(0), -1).data + ) return audio @staticmethod @@ -298,7 +336,7 @@ class WaveGlow(torch.nn.Module): for WN in waveglow.WN: WN.start = torch.nn.utils.remove_weight_norm(WN.start) WN.in_layers = remove(WN.in_layers) - WN.cond_layer = torch.nn.utils.remove_weight_norm(WN.cond_layer) + WN.cond_layers = remove(WN.cond_layers) WN.res_skip_layers = remove(WN.res_skip_layers) return waveglow diff --git a/glow_old.py b/glow_new.py similarity index 59% rename from glow_old.py rename to glow_new.py index e6060c8..7c8e46c 100644 --- a/glow_old.py +++ b/glow_new.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # ***************************************************************************** # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # @@ -13,18 +12,19 @@ # names of its contributors may be used to endorse or promote products # derived from this software without specific prior written permission. # -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF -# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# # ***************************************************************************** +import copy import torch from torch.autograd import Variable import torch.nn.functional as F @@ -33,9 +33,9 @@ import torch.nn.functional as F @torch.jit.script def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): n_channels_int = n_channels[0] - in_act = input_a + input_b - t_act = torch.nn.functional.tanh(in_act[:, :n_channels_int, :]) - s_act = torch.nn.functional.sigmoid(in_act[:, n_channels_int:, :]) + in_act = input_a+input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) acts = t_act * s_act return acts @@ -55,12 +55,8 @@ class WaveGlowLoss(torch.nn.Module): log_s_total = log_s_total + torch.sum(log_s) log_det_W_total += log_det_W_list[i] - loss = ( - torch.sum(z * z) / (2 * self.sigma * self.sigma) - - log_s_total - - log_det_W_total - ) - return loss / (z.size(0) * z.size(1) * z.size(2)) + loss = torch.sum(z*z)/(2*self.sigma*self.sigma) - log_s_total - log_det_W_total + return loss/(z.size(0)*z.size(1)*z.size(2)) class Invertible1x1Conv(torch.nn.Module): @@ -69,19 +65,17 @@ class Invertible1x1Conv(torch.nn.Module): of its weight matrix. If reverse=True it does convolution with inverse """ - def __init__(self, c): super(Invertible1x1Conv, self).__init__() - self.conv = torch.nn.Conv1d( - c, c, kernel_size=1, stride=1, padding=0, bias=False - ) + self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0, + bias=False) # Sample a random orthonormal matrix to initialize weights W = torch.qr(torch.FloatTensor(c, c).normal_())[0] # Ensure determinant is 1.0 not -1.0 if torch.det(W) < 0: - W[:, 0] = -1 * W[:, 0] + W[:,0] = -1*W[:,0] W = W.view(c, c, 1) self.conv.weight.data = W @@ -92,11 +86,11 @@ class Invertible1x1Conv(torch.nn.Module): W = self.conv.weight.squeeze() if reverse: - if not hasattr(self, "W_inverse"): + if not hasattr(self, 'W_inverse'): # Reverse computation - W_inverse = W.inverse() + W_inverse = W.float().inverse() W_inverse = Variable(W_inverse[..., None]) - if z.type() == "torch.cuda.HalfTensor": + if z.type() == 'torch.HalfTensor': W_inverse = W_inverse.half() self.W_inverse = W_inverse z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) @@ -110,102 +104,86 @@ class Invertible1x1Conv(torch.nn.Module): class WN(torch.nn.Module): """ - This is the WaveNet like layer for the affine coupling. The primary - difference from WaveNet is the convolutions need not be causal. There is - also no dilation size reset. The dilation only doubles on each layer + This is the WaveNet like layer for the affine coupling. The primary difference + from WaveNet is the convolutions need not be causal. There is also no dilation + size reset. 
The dilation only doubles on each layer """ - - def __init__( - self, n_in_channels, n_mel_channels, n_layers, n_channels, kernel_size - ): + def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels, + kernel_size): super(WN, self).__init__() - assert kernel_size % 2 == 1 - assert n_channels % 2 == 0 + assert(kernel_size % 2 == 1) + assert(n_channels % 2 == 0) self.n_layers = n_layers self.n_channels = n_channels self.in_layers = torch.nn.ModuleList() self.res_skip_layers = torch.nn.ModuleList() - self.cond_layers = torch.nn.ModuleList() start = torch.nn.Conv1d(n_in_channels, n_channels, 1) - start = torch.nn.utils.weight_norm(start, name="weight") + start = torch.nn.utils.weight_norm(start, name='weight') self.start = start # Initializing last layer to 0 makes the affine coupling layers # do nothing at first. This helps with training stability - end = torch.nn.Conv1d(n_channels, 2 * n_in_channels, 1) + end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1) end.weight.data.zero_() end.bias.data.zero_() self.end = end + cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels*n_layers, 1) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') + for i in range(n_layers): dilation = 2 ** i - padding = int((kernel_size * dilation - dilation) / 2) - in_layer = torch.nn.Conv1d( - n_channels, - 2 * n_channels, - kernel_size, - dilation=dilation, - padding=padding, - ) - in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") + padding = int((kernel_size*dilation - dilation)/2) + in_layer = torch.nn.Conv1d(n_channels, 2*n_channels, kernel_size, + dilation=dilation, padding=padding) + in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') self.in_layers.append(in_layer) - cond_layer = torch.nn.Conv1d(n_mel_channels, 2 * n_channels, 1) - cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") - self.cond_layers.append(cond_layer) # last one is not necessary if i < n_layers - 1: - res_skip_channels = 2 * n_channels + res_skip_channels = 2*n_channels else: res_skip_channels = n_channels res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) - res_skip_layer = torch.nn.utils.weight_norm( - res_skip_layer, name="weight" - ) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') self.res_skip_layers.append(res_skip_layer) def forward(self, forward_input): audio, spect = forward_input audio = self.start(audio) + output = torch.zeros_like(audio) + n_channels_tensor = torch.IntTensor([self.n_channels]) + + spect = self.cond_layer(spect) + for i in range(self.n_layers): + spect_offset = i*2*self.n_channels acts = fused_add_tanh_sigmoid_multiply( self.in_layers[i](audio), - self.cond_layers[i](spect), - torch.IntTensor([self.n_channels]), - ) + spect[:,spect_offset:spect_offset+2*self.n_channels,:], + n_channels_tensor) res_skip_acts = self.res_skip_layers[i](acts) if i < self.n_layers - 1: - audio = res_skip_acts[:, : self.n_channels, :] + audio - skip_acts = res_skip_acts[:, self.n_channels :, :] + audio = audio + res_skip_acts[:,:self.n_channels,:] + output = output + res_skip_acts[:,self.n_channels:,:] else: - skip_acts = res_skip_acts + output = output + res_skip_acts - if i == 0: - output = skip_acts - else: - output = skip_acts + output return self.end(output) class WaveGlow(torch.nn.Module): - def __init__( - self, - n_mel_channels, - n_flows, - n_group, - n_early_every, - n_early_size, - WN_config, - ): + def __init__(self, n_mel_channels, n_flows, n_group, n_early_every, + n_early_size, WN_config): 
super(WaveGlow, self).__init__() - self.upsample = torch.nn.ConvTranspose1d( - n_mel_channels, n_mel_channels, 1024, stride=256 - ) - assert n_group % 2 == 0 + self.upsample = torch.nn.ConvTranspose1d(n_mel_channels, + n_mel_channels, + 1024, stride=256) + assert(n_group % 2 == 0) self.n_flows = n_flows self.n_group = n_group self.n_early_every = n_early_every @@ -213,19 +191,18 @@ class WaveGlow(torch.nn.Module): self.WN = torch.nn.ModuleList() self.convinv = torch.nn.ModuleList() - n_half = int(n_group / 2) + n_half = int(n_group/2) # Set up layers with the right sizes based on how many dimensions # have been output already n_remaining_channels = n_group for k in range(n_flows): if k % self.n_early_every == 0 and k > 0: - n_half = n_half - int(self.n_early_size / 2) + n_half = n_half - int(self.n_early_size/2) n_remaining_channels = n_remaining_channels - self.n_early_size self.convinv.append(Invertible1x1Conv(n_remaining_channels)) - self.WN.append(WN(n_half, n_mel_channels * n_group, **WN_config)) - self.n_remaining_channels = n_remaining_channels - # Useful during inference + self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config)) + self.n_remaining_channels = n_remaining_channels # Useful during inference def forward(self, forward_input): """ @@ -236,16 +213,12 @@ class WaveGlow(torch.nn.Module): # Upsample spectrogram to size of audio spect = self.upsample(spect) - assert spect.size(2) >= audio.size(1) + assert(spect.size(2) >= audio.size(1)) if spect.size(2) > audio.size(1): - spect = spect[:, :, : audio.size(1)] + spect = spect[:, :, :audio.size(1)] spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) - spect = ( - spect.contiguous() - .view(spect.size(0), spect.size(1), -1) - .permute(0, 2, 1) - ) + spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1) output_audio = [] @@ -254,26 +227,26 @@ class WaveGlow(torch.nn.Module): for k in range(self.n_flows): if k % self.n_early_every == 0 and k > 0: - output_audio.append(audio[:, : self.n_early_size, :]) - audio = audio[:, self.n_early_size :, :] + output_audio.append(audio[:,:self.n_early_size,:]) + audio = audio[:,self.n_early_size:,:] audio, log_det_W = self.convinv[k](audio) log_det_W_list.append(log_det_W) - n_half = int(audio.size(1) / 2) - audio_0 = audio[:, :n_half, :] - audio_1 = audio[:, n_half:, :] + n_half = int(audio.size(1)/2) + audio_0 = audio[:,:n_half,:] + audio_1 = audio[:,n_half:,:] output = self.WN[k]((audio_0, spect)) log_s = output[:, n_half:, :] b = output[:, :n_half, :] - audio_1 = torch.exp(log_s) * audio_1 + b + audio_1 = torch.exp(log_s)*audio_1 + b log_s_list.append(log_s) - audio = torch.cat([audio_0, audio_1], 1) + audio = torch.cat([audio_0, audio_1],1) output_audio.append(audio) - return torch.cat(output_audio, 1), log_s_list, log_det_W_list + return torch.cat(output_audio,1), log_s_list, log_det_W_list def infer(self, spect, sigma=1.0): spect = self.upsample(spect) @@ -282,52 +255,41 @@ class WaveGlow(torch.nn.Module): spect = spect[:, :, :-time_cutoff] spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) - spect = ( - spect.contiguous() - .view(spect.size(0), spect.size(1), -1) - .permute(0, 2, 1) - ) + spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) - if spect.type() == "torch.cuda.HalfTensor": - audio = torch.cuda.HalfTensor( - spect.size(0), self.n_remaining_channels, spect.size(2) - ).normal_() + if spect.type() == 
'torch.HalfTensor': + audio = torch.HalfTensor(spect.size(0), + self.n_remaining_channels, + spect.size(2)).normal_() else: - # cuda.FloatTensor -> FloatTensor - audio = torch.FloatTensor( - spect.size(0), self.n_remaining_channels, spect.size(2) - ).normal_() + audio = torch.FloatTensor(spect.size(0), + self.n_remaining_channels, + spect.size(2)).normal_() - audio = torch.autograd.Variable(sigma * audio) + audio = torch.autograd.Variable(sigma*audio) for k in reversed(range(self.n_flows)): - n_half = int(audio.size(1) / 2) - audio_0 = audio[:, :n_half, :] - audio_1 = audio[:, n_half:, :] + n_half = int(audio.size(1)/2) + audio_0 = audio[:,:n_half,:] + audio_1 = audio[:,n_half:,:] output = self.WN[k]((audio_0, spect)) + s = output[:, n_half:, :] b = output[:, :n_half, :] - audio_1 = (audio_1 - b) / torch.exp(s) - audio = torch.cat([audio_0, audio_1], 1) + audio_1 = (audio_1 - b)/torch.exp(s) + audio = torch.cat([audio_0, audio_1],1) audio = self.convinv[k](audio, reverse=True) if k % self.n_early_every == 0 and k > 0: - if spect.type() == "torch.cuda.HalfTensor": - z = torch.cuda.HalfTensor( - spect.size(0), self.n_early_size, spect.size(2) - ).normal_() + if spect.type() == 'torch.HalfTensor': + z = torch.HalfTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_() else: - # cuda.FloatTensor -> FloatTensor - z = torch.FloatTensor( - spect.size(0), self.n_early_size, spect.size(2) - ).normal_() - audio = torch.cat((sigma * z, audio), 1) + z = torch.FloatTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_() + audio = torch.cat((sigma*z, audio),1) - audio = ( - audio.permute(0, 2, 1).contiguous().view(audio.size(0), -1).data - ) + audio = audio.permute(0,2,1).contiguous().view(audio.size(0), -1).data return audio @staticmethod @@ -336,7 +298,7 @@ class WaveGlow(torch.nn.Module): for WN in waveglow.WN: WN.start = torch.nn.utils.remove_weight_norm(WN.start) WN.in_layers = remove(WN.in_layers) - WN.cond_layers = remove(WN.cond_layers) + WN.cond_layer = torch.nn.utils.remove_weight_norm(WN.cond_layer) WN.res_skip_layers = remove(WN.res_skip_layers) return waveglow diff --git a/taco2/hparams.py b/taco2/hparams.py index d123aea..6c632dc 100644 --- a/taco2/hparams.py +++ b/taco2/hparams.py @@ -39,9 +39,9 @@ class HParams(object): filter_length = 1024 hop_length = 256 win_length = 1024 - n_mel_channels: int = 40 + n_mel_channels: int = 80 mel_fmin: float = 0.0 - mel_fmax: float = 4000.0 + mel_fmax: float = 8000.0 ################################ # Model Parameters # ################################
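
Note: as a quick sanity check that the patched glow.py (per-layer cond_layers, remove_weightnorm over cond_layers) and the updated taco2/hparams.py (80 mel channels) line up with a published pretrained WaveGlow checkpoint, a minimal usage sketch follows. It is only a sketch under assumptions: the checkpoint filename and the "model" dict key follow the usual NVIDIA WaveGlow checkpoint layout, and the repository root is assumed to be on sys.path so the patched glow.py is importable when the checkpoint is unpickled.

    import torch

    # Load a pretrained checkpoint (filename and "model" key are assumptions
    # about the published checkpoint layout, not part of this patch).
    checkpoint = torch.load("waveglow_256channels.pt", map_location="cpu")
    waveglow = checkpoint["model"]

    # Strip weight_norm (after this patch this also covers WN.cond_layers)
    # and switch to inference mode.
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.eval()

    # Dummy mel input with 80 channels, matching the updated hparams
    # (n_mel_channels = 80, mel_fmax = 8000.0).
    mel = torch.randn(1, 80, 100)
    with torch.no_grad():
        audio = waveglow.infer(mel, sigma=0.6)
    print(audio.shape)  # roughly n_frames * hop_length samples, here ~25600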