mirror of https://github.com/malarinv/tacotron2
parent d0d273a698
commit 36c731cad0
glow.py (216)
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 # *****************************************************************************
 # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 #
@@ -13,18 +12,19 @@
 #       names of its contributors may be used to endorse or promote products
 #       derived from this software without specific prior written permission.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
-# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 # *****************************************************************************
 import copy
 import torch
 from torch.autograd import Variable
 import torch.nn.functional as F
@@ -33,9 +33,9 @@ import torch.nn.functional as F
 @torch.jit.script
 def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
     n_channels_int = n_channels[0]
-    in_act = input_a + input_b
-    t_act = torch.nn.functional.tanh(in_act[:, :n_channels_int, :])
-    s_act = torch.nn.functional.sigmoid(in_act[:, n_channels_int:, :])
+    in_act = input_a+input_b
+    t_act = torch.tanh(in_act[:, :n_channels_int, :])
+    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
     acts = t_act * s_act
     return acts
@@ -55,12 +55,8 @@ class WaveGlowLoss(torch.nn.Module):
                 log_s_total = log_s_total + torch.sum(log_s)
                 log_det_W_total += log_det_W_list[i]

-        loss = (
-            torch.sum(z * z) / (2 * self.sigma * self.sigma)
-            - log_s_total
-            - log_det_W_total
-        )
-        return loss / (z.size(0) * z.size(1) * z.size(2))
+        loss = torch.sum(z*z)/(2*self.sigma*self.sigma) - log_s_total - log_det_W_total
+        return loss/(z.size(0)*z.size(1)*z.size(2))


 class Invertible1x1Conv(torch.nn.Module):
@@ -69,19 +65,17 @@ class Invertible1x1Conv(torch.nn.Module):
     of its weight matrix. If reverse=True it does convolution with
     inverse
     """

     def __init__(self, c):
         super(Invertible1x1Conv, self).__init__()
-        self.conv = torch.nn.Conv1d(
-            c, c, kernel_size=1, stride=1, padding=0, bias=False
-        )
+        self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0,
+                                    bias=False)

         # Sample a random orthonormal matrix to initialize weights
         W = torch.qr(torch.FloatTensor(c, c).normal_())[0]

         # Ensure determinant is 1.0 not -1.0
         if torch.det(W) < 0:
-            W[:, 0] = -1 * W[:, 0]
+            W[:,0] = -1*W[:,0]
         W = W.view(c, c, 1)
         self.conv.weight.data = W
@@ -92,11 +86,11 @@ class Invertible1x1Conv(torch.nn.Module):
         W = self.conv.weight.squeeze()

         if reverse:
-            if not hasattr(self, "W_inverse"):
+            if not hasattr(self, 'W_inverse'):
                 # Reverse computation
-                W_inverse = W.inverse()
+                W_inverse = W.float().inverse()
                 W_inverse = Variable(W_inverse[..., None])
-                if z.type() == "torch.cuda.HalfTensor":
+                if z.type() == 'torch.HalfTensor':
                     W_inverse = W_inverse.half()
                 self.W_inverse = W_inverse
             z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0)
@@ -110,102 +104,86 @@ class Invertible1x1Conv(torch.nn.Module):

 class WN(torch.nn.Module):
     """
-    This is the WaveNet like layer for the affine coupling. The primary
-    difference from WaveNet is the convolutions need not be causal. There is
-    also no dilation size reset. The dilation only doubles on each layer
+    This is the WaveNet like layer for the affine coupling. The primary difference
+    from WaveNet is the convolutions need not be causal. There is also no dilation
+    size reset. The dilation only doubles on each layer
     """

-    def __init__(
-        self, n_in_channels, n_mel_channels, n_layers, n_channels, kernel_size
-    ):
+    def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels,
+                 kernel_size):
         super(WN, self).__init__()
-        assert kernel_size % 2 == 1
-        assert n_channels % 2 == 0
+        assert(kernel_size % 2 == 1)
+        assert(n_channels % 2 == 0)
         self.n_layers = n_layers
         self.n_channels = n_channels
         self.in_layers = torch.nn.ModuleList()
         self.res_skip_layers = torch.nn.ModuleList()
-        self.cond_layers = torch.nn.ModuleList()

         start = torch.nn.Conv1d(n_in_channels, n_channels, 1)
-        start = torch.nn.utils.weight_norm(start, name="weight")
+        start = torch.nn.utils.weight_norm(start, name='weight')
         self.start = start

         # Initializing last layer to 0 makes the affine coupling layers
         # do nothing at first. This helps with training stability
-        end = torch.nn.Conv1d(n_channels, 2 * n_in_channels, 1)
+        end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1)
         end.weight.data.zero_()
         end.bias.data.zero_()
         self.end = end

+        cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels*n_layers, 1)
+        self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
+
         for i in range(n_layers):
             dilation = 2 ** i
-            padding = int((kernel_size * dilation - dilation) / 2)
-            in_layer = torch.nn.Conv1d(
-                n_channels,
-                2 * n_channels,
-                kernel_size,
-                dilation=dilation,
-                padding=padding,
-            )
-            in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
+            padding = int((kernel_size*dilation - dilation)/2)
+            in_layer = torch.nn.Conv1d(n_channels, 2*n_channels, kernel_size,
+                                       dilation=dilation, padding=padding)
+            in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
             self.in_layers.append(in_layer)

-            cond_layer = torch.nn.Conv1d(n_mel_channels, 2 * n_channels, 1)
-            cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
-            self.cond_layers.append(cond_layer)
-
             # last one is not necessary
             if i < n_layers - 1:
-                res_skip_channels = 2 * n_channels
+                res_skip_channels = 2*n_channels
             else:
                 res_skip_channels = n_channels
             res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1)
-            res_skip_layer = torch.nn.utils.weight_norm(
-                res_skip_layer, name="weight"
-            )
+            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
             self.res_skip_layers.append(res_skip_layer)

     def forward(self, forward_input):
         audio, spect = forward_input
         audio = self.start(audio)
+        output = torch.zeros_like(audio)
+        n_channels_tensor = torch.IntTensor([self.n_channels])
+
+        spect = self.cond_layer(spect)
+
         for i in range(self.n_layers):
+            spect_offset = i*2*self.n_channels
             acts = fused_add_tanh_sigmoid_multiply(
                 self.in_layers[i](audio),
-                self.cond_layers[i](spect),
-                torch.IntTensor([self.n_channels]),
-            )
+                spect[:,spect_offset:spect_offset+2*self.n_channels,:],
+                n_channels_tensor)

             res_skip_acts = self.res_skip_layers[i](acts)
             if i < self.n_layers - 1:
-                audio = res_skip_acts[:, : self.n_channels, :] + audio
-                skip_acts = res_skip_acts[:, self.n_channels :, :]
+                audio = audio + res_skip_acts[:,:self.n_channels,:]
+                output = output + res_skip_acts[:,self.n_channels:,:]
             else:
-                skip_acts = res_skip_acts
-
-            if i == 0:
-                output = skip_acts
-            else:
-                output = skip_acts + output
+                output = output + res_skip_acts
         return self.end(output)


 class WaveGlow(torch.nn.Module):
-    def __init__(
-        self,
-        n_mel_channels,
-        n_flows,
-        n_group,
-        n_early_every,
-        n_early_size,
-        WN_config,
-    ):
+    def __init__(self, n_mel_channels, n_flows, n_group, n_early_every,
+                 n_early_size, WN_config):
         super(WaveGlow, self).__init__()

-        self.upsample = torch.nn.ConvTranspose1d(
-            n_mel_channels, n_mel_channels, 1024, stride=256
-        )
-        assert n_group % 2 == 0
+        self.upsample = torch.nn.ConvTranspose1d(n_mel_channels,
+                                                 n_mel_channels,
+                                                 1024, stride=256)
+        assert(n_group % 2 == 0)
         self.n_flows = n_flows
         self.n_group = n_group
         self.n_early_every = n_early_every
@@ -213,19 +191,18 @@ class WaveGlow(torch.nn.Module):
         self.WN = torch.nn.ModuleList()
         self.convinv = torch.nn.ModuleList()

-        n_half = int(n_group / 2)
+        n_half = int(n_group/2)

         # Set up layers with the right sizes based on how many dimensions
         # have been output already
         n_remaining_channels = n_group
         for k in range(n_flows):
             if k % self.n_early_every == 0 and k > 0:
-                n_half = n_half - int(self.n_early_size / 2)
+                n_half = n_half - int(self.n_early_size/2)
                 n_remaining_channels = n_remaining_channels - self.n_early_size
             self.convinv.append(Invertible1x1Conv(n_remaining_channels))
-            self.WN.append(WN(n_half, n_mel_channels * n_group, **WN_config))
-        self.n_remaining_channels = n_remaining_channels
-        # Useful during inference
+            self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config))
+        self.n_remaining_channels = n_remaining_channels # Useful during inference

     def forward(self, forward_input):
         """
@@ -236,16 +213,12 @@ class WaveGlow(torch.nn.Module):

         # Upsample spectrogram to size of audio
         spect = self.upsample(spect)
-        assert spect.size(2) >= audio.size(1)
+        assert(spect.size(2) >= audio.size(1))
         if spect.size(2) > audio.size(1):
-            spect = spect[:, :, : audio.size(1)]
+            spect = spect[:, :, :audio.size(1)]

         spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
-        spect = (
-            spect.contiguous()
-            .view(spect.size(0), spect.size(1), -1)
-            .permute(0, 2, 1)
-        )
+        spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)

         audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1)
         output_audio = []
@@ -254,26 +227,26 @@ class WaveGlow(torch.nn.Module):

         for k in range(self.n_flows):
             if k % self.n_early_every == 0 and k > 0:
-                output_audio.append(audio[:, : self.n_early_size, :])
-                audio = audio[:, self.n_early_size :, :]
+                output_audio.append(audio[:,:self.n_early_size,:])
+                audio = audio[:,self.n_early_size:,:]

             audio, log_det_W = self.convinv[k](audio)
             log_det_W_list.append(log_det_W)

-            n_half = int(audio.size(1) / 2)
-            audio_0 = audio[:, :n_half, :]
-            audio_1 = audio[:, n_half:, :]
+            n_half = int(audio.size(1)/2)
+            audio_0 = audio[:,:n_half,:]
+            audio_1 = audio[:,n_half:,:]

             output = self.WN[k]((audio_0, spect))
             log_s = output[:, n_half:, :]
             b = output[:, :n_half, :]
-            audio_1 = torch.exp(log_s) * audio_1 + b
+            audio_1 = torch.exp(log_s)*audio_1 + b
             log_s_list.append(log_s)

-            audio = torch.cat([audio_0, audio_1], 1)
+            audio = torch.cat([audio_0, audio_1],1)

         output_audio.append(audio)
-        return torch.cat(output_audio, 1), log_s_list, log_det_W_list
+        return torch.cat(output_audio,1), log_s_list, log_det_W_list

     def infer(self, spect, sigma=1.0):
         spect = self.upsample(spect)
@@ -282,52 +255,41 @@ class WaveGlow(torch.nn.Module):
         spect = spect[:, :, :-time_cutoff]

         spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
-        spect = (
-            spect.contiguous()
-            .view(spect.size(0), spect.size(1), -1)
-            .permute(0, 2, 1)
-        )
+        spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)

-        if spect.type() == "torch.cuda.HalfTensor":
-            audio = torch.cuda.HalfTensor(
-                spect.size(0), self.n_remaining_channels, spect.size(2)
-            ).normal_()
+        if spect.type() == 'torch.HalfTensor':
+            audio = torch.HalfTensor(spect.size(0),
+                                     self.n_remaining_channels,
+                                     spect.size(2)).normal_()
         else:
             # cuda.FloatTensor -> FloatTensor
-            audio = torch.FloatTensor(
-                spect.size(0), self.n_remaining_channels, spect.size(2)
-            ).normal_()
+            audio = torch.FloatTensor(spect.size(0),
+                                      self.n_remaining_channels,
+                                      spect.size(2)).normal_()

-        audio = torch.autograd.Variable(sigma * audio)
+        audio = torch.autograd.Variable(sigma*audio)

         for k in reversed(range(self.n_flows)):
-            n_half = int(audio.size(1) / 2)
-            audio_0 = audio[:, :n_half, :]
-            audio_1 = audio[:, n_half:, :]
+            n_half = int(audio.size(1)/2)
+            audio_0 = audio[:,:n_half,:]
+            audio_1 = audio[:,n_half:,:]

             output = self.WN[k]((audio_0, spect))

             s = output[:, n_half:, :]
             b = output[:, :n_half, :]
-            audio_1 = (audio_1 - b) / torch.exp(s)
-            audio = torch.cat([audio_0, audio_1], 1)
+            audio_1 = (audio_1 - b)/torch.exp(s)
+            audio = torch.cat([audio_0, audio_1],1)

             audio = self.convinv[k](audio, reverse=True)

             if k % self.n_early_every == 0 and k > 0:
-                if spect.type() == "torch.cuda.HalfTensor":
-                    z = torch.cuda.HalfTensor(
-                        spect.size(0), self.n_early_size, spect.size(2)
-                    ).normal_()
+                if spect.type() == 'torch.HalfTensor':
+                    z = torch.HalfTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_()
                 else:
                     # cuda.FloatTensor -> FloatTensor
-                    z = torch.FloatTensor(
-                        spect.size(0), self.n_early_size, spect.size(2)
-                    ).normal_()
-                audio = torch.cat((sigma * z, audio), 1)
+                    z = torch.FloatTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_()
+                audio = torch.cat((sigma*z, audio),1)

-        audio = (
-            audio.permute(0, 2, 1).contiguous().view(audio.size(0), -1).data
-        )
+        audio = audio.permute(0,2,1).contiguous().view(audio.size(0), -1).data
         return audio

     @staticmethod
@@ -336,7 +298,7 @@ class WaveGlow(torch.nn.Module):
         for WN in waveglow.WN:
             WN.start = torch.nn.utils.remove_weight_norm(WN.start)
             WN.in_layers = remove(WN.in_layers)
-            WN.cond_layers = remove(WN.cond_layers)
+            WN.cond_layer = torch.nn.utils.remove_weight_norm(WN.cond_layer)
             WN.res_skip_layers = remove(WN.res_skip_layers)
         return waveglow
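The main functional change in the hunks above is that WN's per-layer cond_layers are replaced by one fused cond_layer whose 2*n_channels*n_layers output channels are sliced per layer via spect_offset. The sketch below is not part of the commit; it only illustrates that channel-offset arithmetic with made-up sizes (n_mel, n_channels, n_layers, T are hypothetical) and ignores the weight_norm and fresh initialization the real layer gets.

import torch

n_mel, n_channels, n_layers, T = 80, 4, 3, 17   # hypothetical sizes
spect = torch.randn(1, n_mel, T)

per_layer = [torch.nn.Conv1d(n_mel, 2 * n_channels, 1) for _ in range(n_layers)]  # old cond_layers
fused = torch.nn.Conv1d(n_mel, 2 * n_channels * n_layers, 1)                      # new cond_layer

# Stack the per-layer weights and biases along the fused conv's output dimension
fused.weight.data = torch.cat([c.weight.data for c in per_layer], dim=0)
fused.bias.data = torch.cat([c.bias.data for c in per_layer], dim=0)

out = fused(spect)
for i, c in enumerate(per_layer):
    spect_offset = i * 2 * n_channels
    # Each per-layer slice of the fused output matches the old per-layer conv
    assert torch.allclose(out[:, spect_offset:spect_offset + 2 * n_channels, :],
                          c(spect), atol=1e-6)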
@@ -0,0 +1,349 @@
+# -*- coding: utf-8 -*-
+# *****************************************************************************
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of the NVIDIA CORPORATION nor the
+#       names of its contributors may be used to endorse or promote products
+#       derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+import torch
+from torch.autograd import Variable
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+    n_channels_int = n_channels[0]
+    in_act = input_a + input_b
+    t_act = torch.nn.functional.tanh(in_act[:, :n_channels_int, :])
+    s_act = torch.nn.functional.sigmoid(in_act[:, n_channels_int:, :])
+    acts = t_act * s_act
+    return acts
+
+
+class WaveGlowLoss(torch.nn.Module):
+    def __init__(self, sigma=1.0):
+        super(WaveGlowLoss, self).__init__()
+        self.sigma = sigma
+
+    def forward(self, model_output):
+        z, log_s_list, log_det_W_list = model_output
+        for i, log_s in enumerate(log_s_list):
+            if i == 0:
+                log_s_total = torch.sum(log_s)
+                log_det_W_total = log_det_W_list[i]
+            else:
+                log_s_total = log_s_total + torch.sum(log_s)
+                log_det_W_total += log_det_W_list[i]
+
+        loss = (
+            torch.sum(z * z) / (2 * self.sigma * self.sigma)
+            - log_s_total
+            - log_det_W_total
+        )
+        return loss / (z.size(0) * z.size(1) * z.size(2))
+
+
+class Invertible1x1Conv(torch.nn.Module):
+    """
+    The layer outputs both the convolution, and the log determinant
+    of its weight matrix. If reverse=True it does convolution with
+    inverse
+    """
+
+    def __init__(self, c):
+        super(Invertible1x1Conv, self).__init__()
+        self.conv = torch.nn.Conv1d(
+            c, c, kernel_size=1, stride=1, padding=0, bias=False
+        )
+
+        # Sample a random orthonormal matrix to initialize weights
+        W = torch.qr(torch.FloatTensor(c, c).normal_())[0]
+
+        # Ensure determinant is 1.0 not -1.0
+        if torch.det(W) < 0:
+            W[:, 0] = -1 * W[:, 0]
+        W = W.view(c, c, 1)
+        self.conv.weight.data = W
+
+    def forward(self, z, reverse=False):
+        # shape
+        batch_size, group_size, n_of_groups = z.size()
+
+        W = self.conv.weight.squeeze()
+
+        if reverse:
+            if not hasattr(self, "W_inverse"):
+                # Reverse computation
+                W_inverse = W.inverse()
+                W_inverse = Variable(W_inverse[..., None])
+                if z.type() == "torch.cuda.HalfTensor":
+                    W_inverse = W_inverse.half()
+                self.W_inverse = W_inverse
+            z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0)
+            return z
+        else:
+            # Forward computation
+            log_det_W = batch_size * n_of_groups * torch.logdet(W)
+            z = self.conv(z)
+            return z, log_det_W
+
+
+class WN(torch.nn.Module):
+    """
+    This is the WaveNet like layer for the affine coupling. The primary
+    difference from WaveNet is the convolutions need not be causal. There is
+    also no dilation size reset. The dilation only doubles on each layer
+    """
+
+    def __init__(
+        self, n_in_channels, n_mel_channels, n_layers, n_channels, kernel_size
+    ):
+        super(WN, self).__init__()
+        assert kernel_size % 2 == 1
+        assert n_channels % 2 == 0
+        self.n_layers = n_layers
+        self.n_channels = n_channels
+        self.in_layers = torch.nn.ModuleList()
+        self.res_skip_layers = torch.nn.ModuleList()
+        self.cond_layers = torch.nn.ModuleList()
+
+        start = torch.nn.Conv1d(n_in_channels, n_channels, 1)
+        start = torch.nn.utils.weight_norm(start, name="weight")
+        self.start = start
+
+        # Initializing last layer to 0 makes the affine coupling layers
+        # do nothing at first. This helps with training stability
+        end = torch.nn.Conv1d(n_channels, 2 * n_in_channels, 1)
+        end.weight.data.zero_()
+        end.bias.data.zero_()
+        self.end = end
+
+        for i in range(n_layers):
+            dilation = 2 ** i
+            padding = int((kernel_size * dilation - dilation) / 2)
+            in_layer = torch.nn.Conv1d(
+                n_channels,
+                2 * n_channels,
+                kernel_size,
+                dilation=dilation,
+                padding=padding,
+            )
+            in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
+            self.in_layers.append(in_layer)
+
+            cond_layer = torch.nn.Conv1d(n_mel_channels, 2 * n_channels, 1)
+            cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
+            self.cond_layers.append(cond_layer)
+
+            # last one is not necessary
+            if i < n_layers - 1:
+                res_skip_channels = 2 * n_channels
+            else:
+                res_skip_channels = n_channels
+            res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1)
+            res_skip_layer = torch.nn.utils.weight_norm(
+                res_skip_layer, name="weight"
+            )
+            self.res_skip_layers.append(res_skip_layer)
+
+    def forward(self, forward_input):
+        audio, spect = forward_input
+        audio = self.start(audio)
+        for i in range(self.n_layers):
+            acts = fused_add_tanh_sigmoid_multiply(
+                self.in_layers[i](audio),
+                self.cond_layers[i](spect),
+                torch.IntTensor([self.n_channels]),
+            )
+
+            res_skip_acts = self.res_skip_layers[i](acts)
+            if i < self.n_layers - 1:
+                audio = res_skip_acts[:, : self.n_channels, :] + audio
+                skip_acts = res_skip_acts[:, self.n_channels :, :]
+            else:
+                skip_acts = res_skip_acts
+
+            if i == 0:
+                output = skip_acts
+            else:
+                output = skip_acts + output
+        return self.end(output)
+
+
+class WaveGlow(torch.nn.Module):
+    def __init__(
+        self,
+        n_mel_channels,
+        n_flows,
+        n_group,
+        n_early_every,
+        n_early_size,
+        WN_config,
+    ):
+        super(WaveGlow, self).__init__()
+
+        self.upsample = torch.nn.ConvTranspose1d(
+            n_mel_channels, n_mel_channels, 1024, stride=256
+        )
+        assert n_group % 2 == 0
+        self.n_flows = n_flows
+        self.n_group = n_group
+        self.n_early_every = n_early_every
+        self.n_early_size = n_early_size
+        self.WN = torch.nn.ModuleList()
+        self.convinv = torch.nn.ModuleList()
+
+        n_half = int(n_group / 2)
+
+        # Set up layers with the right sizes based on how many dimensions
+        # have been output already
+        n_remaining_channels = n_group
+        for k in range(n_flows):
+            if k % self.n_early_every == 0 and k > 0:
+                n_half = n_half - int(self.n_early_size / 2)
+                n_remaining_channels = n_remaining_channels - self.n_early_size
+            self.convinv.append(Invertible1x1Conv(n_remaining_channels))
+            self.WN.append(WN(n_half, n_mel_channels * n_group, **WN_config))
+        self.n_remaining_channels = n_remaining_channels
+        # Useful during inference
+
+    def forward(self, forward_input):
+        """
+        forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames
+        forward_input[1] = audio: batch x time
+        """
+        spect, audio = forward_input
+
+        # Upsample spectrogram to size of audio
+        spect = self.upsample(spect)
+        assert spect.size(2) >= audio.size(1)
+        if spect.size(2) > audio.size(1):
+            spect = spect[:, :, : audio.size(1)]
+
+        spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
+        spect = (
+            spect.contiguous()
+            .view(spect.size(0), spect.size(1), -1)
+            .permute(0, 2, 1)
+        )
+
+        audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1)
+        output_audio = []
+        log_s_list = []
+        log_det_W_list = []
+
+        for k in range(self.n_flows):
+            if k % self.n_early_every == 0 and k > 0:
+                output_audio.append(audio[:, : self.n_early_size, :])
+                audio = audio[:, self.n_early_size :, :]
+
+            audio, log_det_W = self.convinv[k](audio)
+            log_det_W_list.append(log_det_W)
+
+            n_half = int(audio.size(1) / 2)
+            audio_0 = audio[:, :n_half, :]
+            audio_1 = audio[:, n_half:, :]
+
+            output = self.WN[k]((audio_0, spect))
+            log_s = output[:, n_half:, :]
+            b = output[:, :n_half, :]
+            audio_1 = torch.exp(log_s) * audio_1 + b
+            log_s_list.append(log_s)
+
+            audio = torch.cat([audio_0, audio_1], 1)
+
+        output_audio.append(audio)
+        return torch.cat(output_audio, 1), log_s_list, log_det_W_list
+
+    def infer(self, spect, sigma=1.0):
+        spect = self.upsample(spect)
+        # trim conv artifacts. maybe pad spec to kernel multiple
+        time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0]
+        spect = spect[:, :, :-time_cutoff]
+
+        spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
+        spect = (
+            spect.contiguous()
+            .view(spect.size(0), spect.size(1), -1)
+            .permute(0, 2, 1)
+        )
+
+        if spect.type() == "torch.cuda.HalfTensor":
+            audio = torch.cuda.HalfTensor(
+                spect.size(0), self.n_remaining_channels, spect.size(2)
+            ).normal_()
+        else:
+            # cuda.FloatTensor -> FloatTensor
+            audio = torch.FloatTensor(
+                spect.size(0), self.n_remaining_channels, spect.size(2)
+            ).normal_()
+
+        audio = torch.autograd.Variable(sigma * audio)
+
+        for k in reversed(range(self.n_flows)):
+            n_half = int(audio.size(1) / 2)
+            audio_0 = audio[:, :n_half, :]
+            audio_1 = audio[:, n_half:, :]
+
+            output = self.WN[k]((audio_0, spect))
+            s = output[:, n_half:, :]
+            b = output[:, :n_half, :]
+            audio_1 = (audio_1 - b) / torch.exp(s)
+            audio = torch.cat([audio_0, audio_1], 1)
+
+            audio = self.convinv[k](audio, reverse=True)
+
+            if k % self.n_early_every == 0 and k > 0:
+                if spect.type() == "torch.cuda.HalfTensor":
+                    z = torch.cuda.HalfTensor(
+                        spect.size(0), self.n_early_size, spect.size(2)
+                    ).normal_()
+                else:
+                    # cuda.FloatTensor -> FloatTensor
+                    z = torch.FloatTensor(
+                        spect.size(0), self.n_early_size, spect.size(2)
+                    ).normal_()
+                audio = torch.cat((sigma * z, audio), 1)
+
+        audio = (
+            audio.permute(0, 2, 1).contiguous().view(audio.size(0), -1).data
+        )
+        return audio
+
+    @staticmethod
+    def remove_weightnorm(model):
+        waveglow = model
+        for WN in waveglow.WN:
+            WN.start = torch.nn.utils.remove_weight_norm(WN.start)
+            WN.in_layers = remove(WN.in_layers)
+            WN.cond_layers = remove(WN.cond_layers)
+            WN.res_skip_layers = remove(WN.res_skip_layers)
+        return waveglow
+
+
+def remove(conv_list):
+    new_conv_list = torch.nn.ModuleList()
+    for old_conv in conv_list:
+        old_conv = torch.nn.utils.remove_weight_norm(old_conv)
+        new_conv_list.append(old_conv)
+    return new_conv_list
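For reference, a minimal usage sketch of the classes defined in the file above. The hyperparameters, shapes and sigma value are made up for illustration; real values come from the training configuration, not from this file.

import torch

WN_config = dict(n_layers=4, n_channels=64, kernel_size=3)   # hypothetical, small
model = WaveGlow(n_mel_channels=80, n_flows=4, n_group=8,
                 n_early_every=4, n_early_size=2, WN_config=WN_config)
criterion = WaveGlowLoss(sigma=1.0)

spect = torch.randn(2, 80, 63)      # batch x n_mel_channels x frames
audio = torch.randn(2, 63 * 256)    # batch x time (256 = upsample stride)

# Training-style pass: flows map audio to latent z plus the log terms the loss needs
z, log_s_list, log_det_W_list = model((spect, audio))
loss = criterion((z, log_s_list, log_det_W_list))

# Inference samples z ~ N(0, sigma^2) and runs the flows in reverse
with torch.no_grad():
    generated = model.infer(spect, sigma=0.6)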
@@ -7,19 +7,19 @@ class Denoiser(torch.nn.Module):
     """ Removes model bias from audio produced with waveglow """

     def __init__(self, waveglow, filter_length=1024, n_overlap=4,
-                 win_length=1024, mode='zeros'):
+                 win_length=1024, mode='zeros', n_mel_channels=80,):
         super(Denoiser, self).__init__()
         self.stft = STFT(filter_length=filter_length,
                          hop_length=int(filter_length/n_overlap),
                          win_length=win_length).cpu()
         if mode == 'zeros':
             mel_input = torch.zeros(
-                (1, 80, 88),
+                (1, n_mel_channels, 88),
                 dtype=waveglow.upsample.weight.dtype,
                 device=waveglow.upsample.weight.device)
         elif mode == 'normal':
             mel_input = torch.randn(
-                (1, 80, 88),
+                (1, n_mel_channels, 88),
                 dtype=waveglow.upsample.weight.dtype,
                 device=waveglow.upsample.weight.device)
         else:
@@ -2,76 +2,79 @@
 # import tensorflow as tf
+from dataclasses import dataclass
 from .text import symbols

+# from .text_codec import symbols
+
+@dataclass
 class HParams(object):
     """docstring for HParams."""

     ################################
     # Experiment Parameters        #
     ################################
-    epochs=500
-    iters_per_checkpoint=1000
-    seed=1234
-    dynamic_loss_scaling=True
-    fp16_run=False
-    distributed_run=False
-    dist_backend="nccl"
-    dist_url="tcp://localhost:54321"
-    cudnn_enabled=True
-    cudnn_benchmark=False
-    ignore_layers=["embedding.weight"]
+    epochs = 500
+    iters_per_checkpoint = 1000
+    seed = 1234
+    dynamic_loss_scaling = True
+    fp16_run = False
+    distributed_run = False
+    dist_backend = "nccl"
+    dist_url = "tcp://localhost:54321"
+    cudnn_enabled = True
+    cudnn_benchmark = False
+    ignore_layers = ["embedding.weight"]
     ################################
     # Data Parameters             #
     ################################
-    load_mel_from_disk=False
-    training_files="lists/tts_data_train_processed.txt"
-    validation_files="filelists/tts_data_val_processed.txt"
-    text_cleaners=["english_cleaners"]
+    load_mel_from_disk = False
+    training_files = "lists/tts_data_train_processed.txt"
+    validation_files = "filelists/tts_data_val_processed.txt"
+    text_cleaners = ["english_cleaners"]
     ################################
     # Audio Parameters             #
     ################################
-    max_wav_value=32768.0
-    sampling_rate=16000
-    filter_length=1024
-    hop_length=256
-    win_length=1024
-    n_mel_channels=80
-    mel_fmin=0.0
-    mel_fmax=8000.0
+    max_wav_value = 32768.0
+    sampling_rate = 16000
+    filter_length = 1024
+    hop_length = 256
+    win_length = 1024
+    n_mel_channels: int = 40
+    mel_fmin: float = 0.0
+    mel_fmax: float = 4000.0
     ################################
     # Model Parameters             #
     ################################
-    n_symbols=len(symbols)
-    symbols_embedding_dim=512
+    n_symbols = len(symbols)
+    symbols_embedding_dim = 512
     # Encoder parameters
-    encoder_kernel_size=5
-    encoder_n_convolutions=3
-    encoder_embedding_dim=512
+    encoder_kernel_size = 5
+    encoder_n_convolutions = 3
+    encoder_embedding_dim = 512
     # Decoder parameters
-    n_frames_per_step=1 # currently only 1 is supported
-    decoder_rnn_dim=1024
-    prenet_dim=256
-    max_decoder_steps=1000
-    gate_threshold=0.5
-    p_attention_dropout=0.1
-    p_decoder_dropout=0.1
+    n_frames_per_step = 1 # currently only 1 is supported
+    decoder_rnn_dim = 1024
+    prenet_dim = 256
+    max_decoder_steps = 1000
+    gate_threshold = 0.5
+    p_attention_dropout = 0.1
+    p_decoder_dropout = 0.1
     # Attention parameters
-    attention_rnn_dim=1024
-    attention_dim=128
+    attention_rnn_dim = 1024
+    attention_dim = 128
     # Location Layer parameters
-    attention_location_n_filters=32
-    attention_location_kernel_size=31
+    attention_location_n_filters = 32
+    attention_location_kernel_size = 31
     # Mel-post processing network parameters
-    postnet_embedding_dim=512
-    postnet_kernel_size=5
-    postnet_n_convolutions=5
+    postnet_embedding_dim = 512
+    postnet_kernel_size = 5
+    postnet_n_convolutions = 5
     ################################
     # Optimization Hyperparameters #
     ################################
-    use_saved_learning_rate=False
-    learning_rate=1e-3
-    weight_decay=1e-6
-    grad_clip_thresh=1.0
-    batch_size=4
-    mask_padding=True # set model's padded outputs to padded values
+    use_saved_learning_rate = False
+    learning_rate = 1e-3
+    weight_decay = 1e-6
+    grad_clip_thresh = 1.0
+    batch_size = 4
+    mask_padding = True # set model's padded outputs to padded values
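A hedged sketch of how the @dataclass version of HParams behaves (assuming it is imported from this module; the import path is illustrative). Only the annotated fields shown in the hunk above (n_mel_channels, mel_fmin, mel_fmax) become dataclass fields that can be overridden through __init__; the un-annotated assignments stay plain class attributes and keep their defaults.

from hparams import HParams  # hypothetical import path

hparams = HParams()                                    # all defaults
custom = HParams(n_mel_channels=80, mel_fmax=8000.0)   # only annotated fields are init args
print(hparams.n_mel_channels, custom.n_mel_channels)   # 40 80
print(hparams.batch_size, hparams.sampling_rate)       # 4 16000
# HParams(batch_size=16) would raise TypeError: batch_size has no type
# annotation, so it is not a dataclass field.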