integrate tacotron2/waveglow based tts server

master
Malar Kannan 2019-07-03 15:08:00 +05:30
parent 4be2475cc1
commit 5f75aa0a0d
26 changed files with 297 additions and 2528 deletions

LICENSE
View File

@@ -1,29 +0,0 @@
BSD 3-Clause License
Copyright (c) 2018, NVIDIA Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

README.md
View File

@@ -1,23 +0,0 @@
## Setup
- Clone the repo:
`git clone https://github.com/agaralabs/tacotron2`
- `cd` into `tacotron2` and copy the models from wolverine:
`scp wolverine:/home/ubuntu/tacotron2/{checkpoint_15000,waveglow_256channels.pt} ./`
`scp wolverine:/home/ubuntu/tacotron2/waveglow ./`
**Wolverine Details:**
```
Host wolverine
Hostname 54.71.137.17
User ubuntu
IdentityFile ~/.ssh/id_hip_ml
```
- Install the dependencies:
`pip install -r requirements.txt`
## Running
`python final.py`
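
Once the server is running, a quick sanity check is to call the gRPC endpoint directly (a sketch; it assumes the `sia.proto` stubs and the default `localhost:50060` port used by the test client further down this diff):

```
import grpc
from sia.proto import tts_pb2, tts_pb2_grpc

channel = grpc.insecure_channel('localhost:50060')
stub = tts_pb2_grpc.ServerStub(channel)
speech = stub.TextToSpeechAPI(tts_pb2.TextInput(text='How may I help you today?'))
# speech.response holds the synthesized audio bytes
```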

audio_processing.py
View File

@@ -1,93 +0,0 @@
import torch
import numpy as np
from scipy.signal import get_window
import librosa.util as librosa_util
def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
n_fft=800, dtype=np.float32, norm=None):
"""
# from librosa 0.6
Compute the sum-square envelope of a window function at a given hop length.
This is used to estimate modulation effects induced by windowing
observations in short-time fourier transforms.
Parameters
----------
window : string, tuple, number, callable, or list-like
Window specification, as in `get_window`
n_frames : int > 0
The number of analysis frames
hop_length : int > 0
The number of samples to advance between frames
win_length : [optional]
The length of the window function. By default, this matches `n_fft`.
n_fft : int > 0
The length of each analysis frame.
dtype : np.dtype
The data type of the output
Returns
-------
wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
The sum-squared envelope of the window function
"""
if win_length is None:
win_length = n_fft
n = n_fft + hop_length * (n_frames - 1)
x = np.zeros(n, dtype=dtype)
# Compute the squared window at the desired length
win_sq = get_window(window, win_length, fftbins=True)
win_sq = librosa_util.normalize(win_sq, norm=norm)**2
win_sq = librosa_util.pad_center(win_sq, n_fft)
# Fill the envelope
for i in range(n_frames):
sample = i * hop_length
x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
return x
def griffin_lim(magnitudes, stft_fn, n_iters=30):
"""
PARAMS
------
magnitudes: spectrogram magnitudes
stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
"""
angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
angles = angles.astype(np.float32)
angles = torch.autograd.Variable(torch.from_numpy(angles))
signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
for i in range(n_iters):
_, angles = stft_fn.transform(signal)
signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
return signal
def dynamic_range_compression(x, C=1, clip_val=1e-5):
"""
PARAMS
------
C: compression factor
"""
return torch.log(torch.clamp(x, min=clip_val) * C)
def dynamic_range_decompression(x, C=1):
"""
PARAMS
------
C: compression factor used to compress
"""
return torch.exp(x) / C
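
`dynamic_range_compression` and `dynamic_range_decompression` are exact inverses for values above the clip threshold; a quick round-trip check (a sketch, assuming this file is importable as `audio_processing`):

```
import torch
from audio_processing import dynamic_range_compression, dynamic_range_decompression

mag = torch.rand(80, 100) + 1e-3             # stand-in spectrogram magnitudes
comp = dynamic_range_compression(mag)        # log domain, clamped at 1e-5
back = dynamic_range_decompression(comp)     # back to the linear domain
print(torch.allclose(mag, back, atol=1e-6))  # True: all values sit above the clip
```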

View File

@@ -1,28 +0,0 @@
Thank you for calling Pampers.
How may I help you today?
I understand your frustration and disappointment.
I'm sorry it's happening and I'd like to help prevent it in the future.
What style of Baby Dry did you buy?
Was it the Regular or the Flexible?
I have all the information I need about the specifics of the product you purchased now.
Thank you for your patience!
How many diapers came in the package?
And what size were they?
Were they the small, the medium, or the large ones?
Sorry, without the size and count information I will only be able to reimburse you with a minimum fulfillment.
Would that be okay?
So you bought the Pampers Baby Dry and there were 32 diapers in the package.
Is that correct?
Thank you for all that information.
I will definitely pass on your detailed feedback to our Quality Control Team!
I could also suggest a different variant of diapers, that might better suit your needs.
Would you like me to help you with that?
How old is your little one?
And do you have any specific diaper needs you can tell me about?
Our Cruisers are made especially for active babies, and I would definitely suggest them since you said your little one moves around a lot.
What I can also do for you is include a coupon towards your next purchase of Pampers products.
Shall I go ahead with this?
I can send you a link over text message, from which you can directly order this.
Is there anything else I can help you with today?
Thank you for reaching out to us.
Have a good day! Bye!

data_utils.py
View File

@@ -1,111 +0,0 @@
import random
import numpy as np
import torch
import torch.utils.data
import layers
from utils import load_wav_to_torch, load_filepaths_and_text
from text import text_to_sequence
class TextMelLoader(torch.utils.data.Dataset):
"""
1) loads audio,text pairs
2) normalizes text and converts them to sequences of one-hot vectors
3) computes mel-spectrograms from audio files.
"""
def __init__(self, audiopaths_and_text, hparams):
self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
self.text_cleaners = hparams.text_cleaners
self.max_wav_value = hparams.max_wav_value
self.sampling_rate = hparams.sampling_rate
self.load_mel_from_disk = hparams.load_mel_from_disk
self.stft = layers.TacotronSTFT(
hparams.filter_length, hparams.hop_length, hparams.win_length,
hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
hparams.mel_fmax)
random.seed(1234)
random.shuffle(self.audiopaths_and_text)
def get_mel_text_pair(self, audiopath_and_text):
# separate filename and text
audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
text = self.get_text(text)
mel = self.get_mel(audiopath)
return (text, mel)
def get_mel(self, filename):
if not self.load_mel_from_disk:
audio, sampling_rate = load_wav_to_torch(filename)
if sampling_rate != self.stft.sampling_rate:
raise ValueError("{} SR doesn't match target {} SR".format(
    sampling_rate, self.stft.sampling_rate))
audio_norm = audio / self.max_wav_value
audio_norm = audio_norm.unsqueeze(0)
audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
melspec = self.stft.mel_spectrogram(audio_norm)
melspec = torch.squeeze(melspec, 0)
else:
melspec = torch.from_numpy(np.load(filename))
assert melspec.size(0) == self.stft.n_mel_channels, (
'Mel dimension mismatch: given {}, expected {}'.format(
melspec.size(0), self.stft.n_mel_channels))
return melspec
def get_text(self, text):
text_norm = torch.IntTensor(text_to_sequence(text, self.text_cleaners))
return text_norm
def __getitem__(self, index):
return self.get_mel_text_pair(self.audiopaths_and_text[index])
def __len__(self):
return len(self.audiopaths_and_text)
class TextMelCollate():
""" Zero-pads model inputs and targets based on number of frames per setep
"""
def __init__(self, n_frames_per_step):
self.n_frames_per_step = n_frames_per_step
def __call__(self, batch):
"""Collate's training batch from normalized text and mel-spectrogram
PARAMS
------
batch: [text_normalized, mel_normalized]
"""
# Right zero-pad all one-hot text sequences to max input length
input_lengths, ids_sorted_decreasing = torch.sort(
torch.LongTensor([len(x[0]) for x in batch]),
dim=0, descending=True)
max_input_len = input_lengths[0]
text_padded = torch.LongTensor(len(batch), max_input_len)
text_padded.zero_()
for i in range(len(ids_sorted_decreasing)):
text = batch[ids_sorted_decreasing[i]][0]
text_padded[i, :text.size(0)] = text
# Right zero-pad mel-spec
num_mels = batch[0][1].size(0)
max_target_len = max([x[1].size(1) for x in batch])
if max_target_len % self.n_frames_per_step != 0:
max_target_len += self.n_frames_per_step - max_target_len % self.n_frames_per_step
assert max_target_len % self.n_frames_per_step == 0
# include mel padded and gate padded
mel_padded = torch.FloatTensor(len(batch), num_mels, max_target_len)
mel_padded.zero_()
gate_padded = torch.FloatTensor(len(batch), max_target_len)
gate_padded.zero_()
output_lengths = torch.LongTensor(len(batch))
for i in range(len(ids_sorted_decreasing)):
mel = batch[ids_sorted_decreasing[i]][1]
mel_padded[i, :, :mel.size(1)] = mel
gate_padded[i, mel.size(1)-1:] = 1
output_lengths[i] = mel.size(1)
return text_padded, input_lengths, mel_padded, gate_padded, \
output_lengths
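
The padding and length-sorting done by `TextMelCollate` can be exercised with dummy tensors (a sketch; assumes this file is importable as `data_utils`):

```
import torch
from data_utils import TextMelCollate

collate = TextMelCollate(n_frames_per_step=1)
# two dummy (text, mel) pairs of different lengths, 80 mel channels each
batch = [
    (torch.IntTensor([1, 2, 3, 4]), torch.rand(80, 50)),
    (torch.IntTensor([5, 6]), torch.rand(80, 30)),
]
text_padded, input_lengths, mel_padded, gate_padded, output_lengths = collate(batch)
print(text_padded.shape, mel_padded.shape)  # torch.Size([2, 4]) torch.Size([2, 80, 50])
```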

View File

@@ -1,27 +0,0 @@
import grpc
from sia.proto import tts_pb2
from sia.proto import tts_pb2_grpc
from tts import player_gen
def tts_player():
player = player_gen()
channel = grpc.insecure_channel('localhost:50060')
stub = tts_pb2_grpc.ServerStub(channel)
def play(t):
test_text = tts_pb2.TextInput(text=t)
speech = stub.TextToSpeechAPI(test_text)
player(speech.response)
return play
def main():
play = tts_player()
play('How may I help you today?')
import ipdb
ipdb.set_trace()
if __name__ == '__main__':
main()

distributed.py
View File

@@ -1,173 +0,0 @@
import torch
import torch.distributed as dist
from torch.nn.modules import Module
from torch.autograd import Variable
def _flatten_dense_tensors(tensors):
"""Flatten dense tensors into a contiguous 1D buffer. Assume tensors are of
same dense type.
Since inputs are dense, the resulting tensor will be a concatenated 1D
buffer. Element-wise operation on this buffer will be equivalent to
operating individually.
Arguments:
tensors (Iterable[Tensor]): dense tensors to flatten.
Returns:
A contiguous 1D buffer containing input tensors.
"""
if len(tensors) == 1:
return tensors[0].contiguous().view(-1)
flat = torch.cat([t.contiguous().view(-1) for t in tensors], dim=0)
return flat
def _unflatten_dense_tensors(flat, tensors):
"""View a flat buffer using the sizes of tensors. Assume that tensors are of
same dense type, and that flat is given by _flatten_dense_tensors.
Arguments:
flat (Tensor): flattened dense tensors to unflatten.
tensors (Iterable[Tensor]): dense tensors whose sizes will be used to
unflatten flat.
Returns:
Unflattened dense tensors with sizes same as tensors and values from
flat.
"""
outputs = []
offset = 0
for tensor in tensors:
numel = tensor.numel()
outputs.append(flat.narrow(0, offset, numel).view_as(tensor))
offset += numel
return tuple(outputs)
'''
This version of DistributedDataParallel is designed to be used in conjunction with the multiproc.py
launcher included with this example. It assumes that your run is using multiprocess with 1
GPU/process, that the model is on the correct device, and that torch.cuda.set_device has been
used to set the device.
Parameters are broadcasted to the other processes on initialization of DistributedDataParallel,
and will be allreduced at the finish of the backward pass.
'''
class DistributedDataParallel(Module):
def __init__(self, module):
super(DistributedDataParallel, self).__init__()
#fallback for PyTorch 0.3
if not hasattr(dist, '_backend'):
self.warn_on_half = True
else:
self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False
self.module = module
for p in self.module.state_dict().values():
if not torch.is_tensor(p):
continue
dist.broadcast(p, 0)
def allreduce_params():
if(self.needs_reduction):
self.needs_reduction = False
buckets = {}
for param in self.module.parameters():
if param.requires_grad and param.grad is not None:
tp = type(param.data)
if tp not in buckets:
buckets[tp] = []
buckets[tp].append(param)
if self.warn_on_half:
if torch.cuda.HalfTensor in buckets:
print("WARNING: gloo dist backend for half parameters may be extremely slow." +
" It is recommended to use the NCCL backend in this case. This currently requires" +
"PyTorch built from top of tree master.")
self.warn_on_half = False
for tp in buckets:
bucket = buckets[tp]
grads = [param.grad.data for param in bucket]
coalesced = _flatten_dense_tensors(grads)
dist.all_reduce(coalesced)
coalesced /= dist.get_world_size()
for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
buf.copy_(synced)
for param in list(self.module.parameters()):
def allreduce_hook(*unused):
param._execution_engine.queue_callback(allreduce_params)
if param.requires_grad:
param.register_hook(allreduce_hook)
def forward(self, *inputs, **kwargs):
self.needs_reduction = True
return self.module(*inputs, **kwargs)
'''
def _sync_buffers(self):
buffers = list(self.module._all_buffers())
if len(buffers) > 0:
# cross-node buffer sync
flat_buffers = _flatten_dense_tensors(buffers)
dist.broadcast(flat_buffers, 0)
for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)):
buf.copy_(synced)
def train(self, mode=True):
# Clear NCCL communicator and CUDA event cache of the default group ID,
# These cache will be recreated at the later call. This is currently a
# work-around for a potential NCCL deadlock.
if dist._backend == dist.dist_backend.NCCL:
dist._clear_group_cache()
super(DistributedDataParallel, self).train(mode)
self.module.train(mode)
'''
'''
Modifies existing model to do gradient allreduce, but doesn't change class
so you don't need "module"
'''
def apply_gradient_allreduce(module):
if not hasattr(dist, '_backend'):
module.warn_on_half = True
else:
module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False
for p in module.state_dict().values():
if not torch.is_tensor(p):
continue
dist.broadcast(p, 0)
def allreduce_params():
if(module.needs_reduction):
module.needs_reduction = False
buckets = {}
for param in module.parameters():
if param.requires_grad and param.grad is not None:
tp = param.data.dtype
if tp not in buckets:
buckets[tp] = []
buckets[tp].append(param)
if module.warn_on_half:
if torch.cuda.HalfTensor in buckets:
print("WARNING: gloo dist backend for half parameters may be extremely slow." +
" It is recommended to use the NCCL backend in this case. This currently requires" +
"PyTorch built from top of tree master.")
module.warn_on_half = False
for tp in buckets:
bucket = buckets[tp]
grads = [param.grad.data for param in bucket]
coalesced = _flatten_dense_tensors(grads)
dist.all_reduce(coalesced)
coalesced /= dist.get_world_size()
for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
buf.copy_(synced)
for param in list(module.parameters()):
def allreduce_hook(*unused):
Variable._execution_engine.queue_callback(allreduce_params)
if param.requires_grad:
param.register_hook(allreduce_hook)
def set_needs_reduction(self, input, output):
self.needs_reduction = True
module.register_forward_hook(set_needs_reduction)
return module
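
A minimal sketch of wrapping a model with `apply_gradient_allreduce` (it uses a single-process gloo group purely for illustration; real runs launch one GPU per process with the NCCL backend, and the `tcp://localhost:54321` URL matches the default in `hparams.py`):

```
import torch
import torch.distributed as dist
from distributed import apply_gradient_allreduce

dist.init_process_group(backend="gloo", init_method="tcp://localhost:54321",
                        world_size=1, rank=0)
model = torch.nn.Linear(10, 10)
model = apply_gradient_allreduce(model)

out = model(torch.randn(4, 10)).sum()
out.backward()  # gradients are all-reduced and averaged at the end of backward
```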

glow.py
View File

@@ -1,311 +0,0 @@
# *****************************************************************************
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************
import copy
import torch
from torch.autograd import Variable
import torch.nn.functional as F
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
n_channels_int = n_channels[0]
in_act = input_a+input_b
t_act = torch.nn.functional.tanh(in_act[:, :n_channels_int, :])
s_act = torch.nn.functional.sigmoid(in_act[:, n_channels_int:, :])
acts = t_act * s_act
return acts
class WaveGlowLoss(torch.nn.Module):
def __init__(self, sigma=1.0):
super(WaveGlowLoss, self).__init__()
self.sigma = sigma
def forward(self, model_output):
z, log_s_list, log_det_W_list = model_output
for i, log_s in enumerate(log_s_list):
if i == 0:
log_s_total = torch.sum(log_s)
log_det_W_total = log_det_W_list[i]
else:
log_s_total = log_s_total + torch.sum(log_s)
log_det_W_total += log_det_W_list[i]
loss = torch.sum(z*z)/(2*self.sigma*self.sigma) - log_s_total - log_det_W_total
return loss/(z.size(0)*z.size(1)*z.size(2))
class Invertible1x1Conv(torch.nn.Module):
"""
The layer outputs both the convolution, and the log determinant
of its weight matrix. If reverse=True it does convolution with
inverse
"""
def __init__(self, c):
super(Invertible1x1Conv, self).__init__()
self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0,
bias=False)
# Sample a random orthonormal matrix to initialize weights
W = torch.qr(torch.FloatTensor(c, c).normal_())[0]
# Ensure determinant is 1.0 not -1.0
if torch.det(W) < 0:
W[:,0] = -1*W[:,0]
W = W.view(c, c, 1)
self.conv.weight.data = W
def forward(self, z, reverse=False):
# shape
batch_size, group_size, n_of_groups = z.size()
W = self.conv.weight.squeeze()
if reverse:
if not hasattr(self, 'W_inverse'):
# Reverse computation
W_inverse = W.inverse()
W_inverse = Variable(W_inverse[..., None])
if z.type() == 'torch.cuda.HalfTensor':
W_inverse = W_inverse.half()
self.W_inverse = W_inverse
z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0)
return z
else:
# Forward computation
log_det_W = batch_size * n_of_groups * torch.logdet(W)
z = self.conv(z)
return z, log_det_W
class WN(torch.nn.Module):
"""
This is the WaveNet like layer for the affine coupling. The primary difference
from WaveNet is the convolutions need not be causal. There is also no dilation
size reset. The dilation only doubles on each layer
"""
def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels,
kernel_size):
super(WN, self).__init__()
assert(kernel_size % 2 == 1)
assert(n_channels % 2 == 0)
self.n_layers = n_layers
self.n_channels = n_channels
self.in_layers = torch.nn.ModuleList()
self.res_skip_layers = torch.nn.ModuleList()
self.cond_layers = torch.nn.ModuleList()
start = torch.nn.Conv1d(n_in_channels, n_channels, 1)
start = torch.nn.utils.weight_norm(start, name='weight')
self.start = start
# Initializing last layer to 0 makes the affine coupling layers
# do nothing at first. This helps with training stability
end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1)
end.weight.data.zero_()
end.bias.data.zero_()
self.end = end
for i in range(n_layers):
dilation = 2 ** i
padding = int((kernel_size*dilation - dilation)/2)
in_layer = torch.nn.Conv1d(n_channels, 2*n_channels, kernel_size,
dilation=dilation, padding=padding)
in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
self.in_layers.append(in_layer)
cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels, 1)
cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
self.cond_layers.append(cond_layer)
# last one is not necessary
if i < n_layers - 1:
res_skip_channels = 2*n_channels
else:
res_skip_channels = n_channels
res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1)
res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
self.res_skip_layers.append(res_skip_layer)
def forward(self, forward_input):
audio, spect = forward_input
audio = self.start(audio)
for i in range(self.n_layers):
acts = fused_add_tanh_sigmoid_multiply(
self.in_layers[i](audio),
self.cond_layers[i](spect),
torch.IntTensor([self.n_channels]))
res_skip_acts = self.res_skip_layers[i](acts)
if i < self.n_layers - 1:
audio = res_skip_acts[:,:self.n_channels,:] + audio
skip_acts = res_skip_acts[:,self.n_channels:,:]
else:
skip_acts = res_skip_acts
if i == 0:
output = skip_acts
else:
output = skip_acts + output
return self.end(output)
class WaveGlow(torch.nn.Module):
def __init__(self, n_mel_channels, n_flows, n_group, n_early_every,
n_early_size, WN_config):
super(WaveGlow, self).__init__()
self.upsample = torch.nn.ConvTranspose1d(n_mel_channels,
n_mel_channels,
1024, stride=256)
assert(n_group % 2 == 0)
self.n_flows = n_flows
self.n_group = n_group
self.n_early_every = n_early_every
self.n_early_size = n_early_size
self.WN = torch.nn.ModuleList()
self.convinv = torch.nn.ModuleList()
n_half = int(n_group/2)
# Set up layers with the right sizes based on how many dimensions
# have been output already
n_remaining_channels = n_group
for k in range(n_flows):
if k % self.n_early_every == 0 and k > 0:
n_half = n_half - int(self.n_early_size/2)
n_remaining_channels = n_remaining_channels - self.n_early_size
self.convinv.append(Invertible1x1Conv(n_remaining_channels))
self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config))
self.n_remaining_channels = n_remaining_channels # Useful during inference
def forward(self, forward_input):
"""
forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames
forward_input[1] = audio: batch x time
"""
spect, audio = forward_input
# Upsample spectrogram to size of audio
spect = self.upsample(spect)
assert(spect.size(2) >= audio.size(1))
if spect.size(2) > audio.size(1):
spect = spect[:, :, :audio.size(1)]
spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)
audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1)
output_audio = []
log_s_list = []
log_det_W_list = []
for k in range(self.n_flows):
if k % self.n_early_every == 0 and k > 0:
output_audio.append(audio[:,:self.n_early_size,:])
audio = audio[:,self.n_early_size:,:]
audio, log_det_W = self.convinv[k](audio)
log_det_W_list.append(log_det_W)
n_half = int(audio.size(1)/2)
audio_0 = audio[:,:n_half,:]
audio_1 = audio[:,n_half:,:]
output = self.WN[k]((audio_0, spect))
log_s = output[:, n_half:, :]
b = output[:, :n_half, :]
audio_1 = torch.exp(log_s)*audio_1 + b
log_s_list.append(log_s)
audio = torch.cat([audio_0, audio_1],1)
output_audio.append(audio)
return torch.cat(output_audio,1), log_s_list, log_det_W_list
def infer(self, spect, sigma=1.0):
spect = self.upsample(spect)
# trim conv artifacts. maybe pad spec to kernel multiple
time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0]
spect = spect[:, :, :-time_cutoff]
spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)
if spect.type() == 'torch.cuda.HalfTensor':
audio = torch.cuda.HalfTensor(spect.size(0),
self.n_remaining_channels,
spect.size(2)).normal_()
else:
# cuda.FloatTensor -> FloatTensor
audio = torch.FloatTensor(spect.size(0),
self.n_remaining_channels,
spect.size(2)).normal_()
audio = torch.autograd.Variable(sigma*audio)
for k in reversed(range(self.n_flows)):
n_half = int(audio.size(1)/2)
audio_0 = audio[:,:n_half,:]
audio_1 = audio[:,n_half:,:]
output = self.WN[k]((audio_0, spect))
s = output[:, n_half:, :]
b = output[:, :n_half, :]
audio_1 = (audio_1 - b)/torch.exp(s)
audio = torch.cat([audio_0, audio_1],1)
audio = self.convinv[k](audio, reverse=True)
if k % self.n_early_every == 0 and k > 0:
if spect.type() == 'torch.cuda.HalfTensor':
z = torch.cuda.HalfTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_()
else:
# cuda.FloatTensor -> FloatTensor
z = torch.FloatTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_()
audio = torch.cat((sigma*z, audio),1)
audio = audio.permute(0,2,1).contiguous().view(audio.size(0), -1).data
return audio
@staticmethod
def remove_weightnorm(model):
waveglow = model
for WN in waveglow.WN:
WN.start = torch.nn.utils.remove_weight_norm(WN.start)
WN.in_layers = remove(WN.in_layers)
WN.cond_layers = remove(WN.cond_layers)
WN.res_skip_layers = remove(WN.res_skip_layers)
return waveglow
def remove(conv_list):
new_conv_list = torch.nn.ModuleList()
for old_conv in conv_list:
old_conv = torch.nn.utils.remove_weight_norm(old_conv)
new_conv_list.append(old_conv)
return new_conv_list
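
For synthesis the flow runs in reverse through `infer`; a sketch of vocoding a mel-spectrogram with the checkpoint named in the README (the `'model'` key is how published WaveGlow checkpoints are packed; since this copy of `glow.py` draws its noise with plain `FloatTensor`, per the `cuda.FloatTensor -> FloatTensor` comments above, the sketch stays on CPU):

```
import torch

waveglow = torch.load('waveglow_256channels.pt', map_location='cpu')['model']
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow.eval()

mel = torch.rand(1, 80, 100)                  # stand-in for a Tacotron 2 mel output
with torch.no_grad():
    audio = waveglow.infer(mel, sigma=0.666)  # (1, time) waveform tensor
```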

hparams.py
View File

@@ -1,96 +0,0 @@
import tensorflow as tf
from text import symbols
# changed path, sampling rate and batch size
def create_hparams(hparams_string=None, verbose=False):
"""Create model hyperparameters. Parse nondefault from given string."""
hparams = tf.contrib.training.HParams(
################################
# Experiment Parameters #
################################
epochs=500,
iters_per_checkpoint=1000,
seed=1234,
dynamic_loss_scaling=True,
fp16_run=False,
distributed_run=False,
dist_backend="nccl",
dist_url="tcp://localhost:54321",
cudnn_enabled=True,
cudnn_benchmark=False,
ignore_layers=['embedding.weight'],
################################
# Data Parameters #
################################
load_mel_from_disk=False,
training_files='lists/tts_data_train_processed.txt',
validation_files='filelists/tts_data_val_processed.txt',
text_cleaners=['english_cleaners'],
################################
# Audio Parameters #
################################
max_wav_value=32768.0,
sampling_rate=16000,
filter_length=1024,
hop_length=256,
win_length=1024,
n_mel_channels=80,
mel_fmin=0.0,
mel_fmax=8000.0,
################################
# Model Parameters #
################################
n_symbols=len(symbols),
symbols_embedding_dim=512,
# Encoder parameters
encoder_kernel_size=5,
encoder_n_convolutions=3,
encoder_embedding_dim=512,
# Decoder parameters
n_frames_per_step=1, # currently only 1 is supported
decoder_rnn_dim=1024,
prenet_dim=256,
max_decoder_steps=1000,
gate_threshold=0.5,
p_attention_dropout=0.1,
p_decoder_dropout=0.1,
# Attention parameters
attention_rnn_dim=1024,
attention_dim=128,
# Location Layer parameters
attention_location_n_filters=32,
attention_location_kernel_size=31,
# Mel-post processing network parameters
postnet_embedding_dim=512,
postnet_kernel_size=5,
postnet_n_convolutions=5,
################################
# Optimization Hyperparameters #
################################
use_saved_learning_rate=False,
learning_rate=1e-3,
weight_decay=1e-6,
grad_clip_thresh=1.0,
batch_size=4,
mask_padding=True # set model's padded outputs to padded values
)
if hparams_string:
tf.logging.info('Parsing command line hparams: %s', hparams_string)
hparams.parse(hparams_string)
if verbose:
tf.logging.info('Final parsed hparams: %s', hparams.values())
return hparams
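
Non-default values can be overridden with a comma-separated string, which `hparams.parse` applies on top of the defaults (assumes a TensorFlow 1.x install, as the unpinned `tensorflow` entry in requirements.txt implies):

```
from hparams import create_hparams

hparams = create_hparams('batch_size=8,learning_rate=5e-4')
print(hparams.batch_size, hparams.sampling_rate)  # 8 16000
```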

layers.py
View File

@@ -1,80 +0,0 @@
import torch
from librosa.filters import mel as librosa_mel_fn
from audio_processing import dynamic_range_compression
from audio_processing import dynamic_range_decompression
from stft import STFT
class LinearNorm(torch.nn.Module):
def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
super(LinearNorm, self).__init__()
self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
torch.nn.init.xavier_uniform_(
self.linear_layer.weight,
gain=torch.nn.init.calculate_gain(w_init_gain))
def forward(self, x):
return self.linear_layer(x)
class ConvNorm(torch.nn.Module):
def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
padding=None, dilation=1, bias=True, w_init_gain='linear'):
super(ConvNorm, self).__init__()
if padding is None:
assert(kernel_size % 2 == 1)
padding = int(dilation * (kernel_size - 1) / 2)
self.conv = torch.nn.Conv1d(in_channels, out_channels,
kernel_size=kernel_size, stride=stride,
padding=padding, dilation=dilation,
bias=bias)
torch.nn.init.xavier_uniform_(
self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
def forward(self, signal):
conv_signal = self.conv(signal)
return conv_signal
class TacotronSTFT(torch.nn.Module):
def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
mel_fmax=8000.0):
super(TacotronSTFT, self).__init__()
self.n_mel_channels = n_mel_channels
self.sampling_rate = sampling_rate
self.stft_fn = STFT(filter_length, hop_length, win_length)
mel_basis = librosa_mel_fn(
sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
mel_basis = torch.from_numpy(mel_basis).float()
self.register_buffer('mel_basis', mel_basis)
def spectral_normalize(self, magnitudes):
output = dynamic_range_compression(magnitudes)
return output
def spectral_de_normalize(self, magnitudes):
output = dynamic_range_decompression(magnitudes)
return output
def mel_spectrogram(self, y):
"""Computes mel-spectrograms from a batch of waves
PARAMS
------
y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
RETURNS
-------
mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
"""
assert(torch.min(y.data) >= -1)
assert(torch.max(y.data) <= 1)
magnitudes, phases = self.stft_fn.transform(y)
magnitudes = magnitudes.data
mel_output = torch.matmul(self.mel_basis, magnitudes)
mel_output = self.spectral_normalize(mel_output)
return mel_output
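
Putting `TacotronSTFT` to work on a waveform (a sketch with a random signal; real audio would come from `load_wav_to_torch` and be scaled by `max_wav_value`, as in `data_utils.py`):

```
import torch
from layers import TacotronSTFT

stft = TacotronSTFT(sampling_rate=16000)  # remaining arguments keep the defaults above
wav = torch.rand(1, 16000) * 2 - 1        # one second of fake audio in [-1, 1]
mel = stft.mel_spectrogram(wav)
print(mel.shape)                          # torch.Size([1, 80, 63])
```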

logger.py
View File

@@ -1,48 +0,0 @@
import random
import torch
from tensorboardX import SummaryWriter
from plotting_utils import plot_alignment_to_numpy, plot_spectrogram_to_numpy
from plotting_utils import plot_gate_outputs_to_numpy
class Tacotron2Logger(SummaryWriter):
def __init__(self, logdir):
super(Tacotron2Logger, self).__init__(logdir)
def log_training(self, reduced_loss, grad_norm, learning_rate, duration,
iteration):
self.add_scalar("training.loss", reduced_loss, iteration)
self.add_scalar("grad.norm", grad_norm, iteration)
self.add_scalar("learning.rate", learning_rate, iteration)
self.add_scalar("duration", duration, iteration)
def log_validation(self, reduced_loss, model, y, y_pred, iteration):
self.add_scalar("validation.loss", reduced_loss, iteration)
_, mel_outputs, gate_outputs, alignments = y_pred
mel_targets, gate_targets = y
# plot distribution of parameters
for tag, value in model.named_parameters():
tag = tag.replace('.', '/')
self.add_histogram(tag, value.data.cpu().numpy(), iteration)
# plot alignment, mel target and predicted, gate target and predicted
idx = random.randint(0, alignments.size(0) - 1)
self.add_image(
"alignment",
plot_alignment_to_numpy(alignments[idx].data.cpu().numpy().T),
iteration)
self.add_image(
"mel_target",
plot_spectrogram_to_numpy(mel_targets[idx].data.cpu().numpy()),
iteration)
self.add_image(
"mel_predicted",
plot_spectrogram_to_numpy(mel_outputs[idx].data.cpu().numpy()),
iteration)
self.add_image(
"gate",
plot_gate_outputs_to_numpy(
gate_targets[idx].data.cpu().numpy(),
torch.sigmoid(gate_outputs[idx]).data.cpu().numpy()),
iteration)

loss_function.py
View File

@@ -1,19 +0,0 @@
from torch import nn
class Tacotron2Loss(nn.Module):
def __init__(self):
super(Tacotron2Loss, self).__init__()
def forward(self, model_output, targets):
mel_target, gate_target = targets[0], targets[1]
mel_target.requires_grad = False
gate_target.requires_grad = False
gate_target = gate_target.view(-1, 1)
mel_out, mel_out_postnet, gate_out, _ = model_output
gate_out = gate_out.view(-1, 1)
mel_loss = nn.MSELoss()(mel_out, mel_target) + \
nn.MSELoss()(mel_out_postnet, mel_target)
gate_loss = nn.BCEWithLogitsLoss()(gate_out, gate_target)
return mel_loss + gate_loss

model.py
View File

@@ -1,529 +0,0 @@
from math import sqrt
import torch
from torch.autograd import Variable
from torch import nn
from torch.nn import functional as F
from layers import ConvNorm, LinearNorm
from utils import to_gpu, get_mask_from_lengths
class LocationLayer(nn.Module):
def __init__(self, attention_n_filters, attention_kernel_size,
attention_dim):
super(LocationLayer, self).__init__()
padding = int((attention_kernel_size - 1) / 2)
self.location_conv = ConvNorm(2, attention_n_filters,
kernel_size=attention_kernel_size,
padding=padding, bias=False, stride=1,
dilation=1)
self.location_dense = LinearNorm(attention_n_filters, attention_dim,
bias=False, w_init_gain='tanh')
def forward(self, attention_weights_cat):
processed_attention = self.location_conv(attention_weights_cat)
processed_attention = processed_attention.transpose(1, 2)
processed_attention = self.location_dense(processed_attention)
return processed_attention
class Attention(nn.Module):
def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
attention_location_n_filters, attention_location_kernel_size):
super(Attention, self).__init__()
self.query_layer = LinearNorm(attention_rnn_dim, attention_dim,
bias=False, w_init_gain='tanh')
self.memory_layer = LinearNorm(embedding_dim, attention_dim, bias=False,
w_init_gain='tanh')
self.v = LinearNorm(attention_dim, 1, bias=False)
self.location_layer = LocationLayer(attention_location_n_filters,
attention_location_kernel_size,
attention_dim)
self.score_mask_value = -float("inf")
def get_alignment_energies(self, query, processed_memory,
attention_weights_cat):
"""
PARAMS
------
query: decoder output (batch, n_mel_channels * n_frames_per_step)
processed_memory: processed encoder outputs (B, T_in, attention_dim)
attention_weights_cat: cumulative and prev. att weights (B, 2, max_time)
RETURNS
-------
alignment (batch, max_time)
"""
processed_query = self.query_layer(query.unsqueeze(1))
processed_attention_weights = self.location_layer(attention_weights_cat)
energies = self.v(torch.tanh(
processed_query + processed_attention_weights + processed_memory))
energies = energies.squeeze(-1)
return energies
def forward(self, attention_hidden_state, memory, processed_memory,
attention_weights_cat, mask):
"""
PARAMS
------
attention_hidden_state: attention rnn last output
memory: encoder outputs
processed_memory: processed encoder outputs
attention_weights_cat: previous and cumulative attention weights
mask: binary mask for padded data
"""
alignment = self.get_alignment_energies(
attention_hidden_state, processed_memory, attention_weights_cat)
if mask is not None:
alignment.data.masked_fill_(mask, self.score_mask_value)
attention_weights = F.softmax(alignment, dim=1)
attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
attention_context = attention_context.squeeze(1)
return attention_context, attention_weights
class Prenet(nn.Module):
def __init__(self, in_dim, sizes):
super(Prenet, self).__init__()
in_sizes = [in_dim] + sizes[:-1]
self.layers = nn.ModuleList(
[LinearNorm(in_size, out_size, bias=False)
for (in_size, out_size) in zip(in_sizes, sizes)])
def forward(self, x):
for linear in self.layers:
x = F.dropout(F.relu(linear(x)), p=0.5, training=True)
return x
class Postnet(nn.Module):
"""Postnet
- Five 1-d convolutions with 512 channels and kernel size 5
"""
def __init__(self, hparams):
super(Postnet, self).__init__()
self.convolutions = nn.ModuleList()
self.convolutions.append(
nn.Sequential(
ConvNorm(hparams.n_mel_channels, hparams.postnet_embedding_dim,
kernel_size=hparams.postnet_kernel_size, stride=1,
padding=int((hparams.postnet_kernel_size - 1) / 2),
dilation=1, w_init_gain='tanh'),
nn.BatchNorm1d(hparams.postnet_embedding_dim))
)
for i in range(1, hparams.postnet_n_convolutions - 1):
self.convolutions.append(
nn.Sequential(
ConvNorm(hparams.postnet_embedding_dim,
hparams.postnet_embedding_dim,
kernel_size=hparams.postnet_kernel_size, stride=1,
padding=int((hparams.postnet_kernel_size - 1) / 2),
dilation=1, w_init_gain='tanh'),
nn.BatchNorm1d(hparams.postnet_embedding_dim))
)
self.convolutions.append(
nn.Sequential(
ConvNorm(hparams.postnet_embedding_dim, hparams.n_mel_channels,
kernel_size=hparams.postnet_kernel_size, stride=1,
padding=int((hparams.postnet_kernel_size - 1) / 2),
dilation=1, w_init_gain='linear'),
nn.BatchNorm1d(hparams.n_mel_channels))
)
def forward(self, x):
for i in range(len(self.convolutions) - 1):
x = F.dropout(torch.tanh(self.convolutions[i](x)), 0.5, self.training)
x = F.dropout(self.convolutions[-1](x), 0.5, self.training)
return x
class Encoder(nn.Module):
"""Encoder module:
- Three 1-d convolution banks
- Bidirectional LSTM
"""
def __init__(self, hparams):
super(Encoder, self).__init__()
convolutions = []
for _ in range(hparams.encoder_n_convolutions):
conv_layer = nn.Sequential(
ConvNorm(hparams.encoder_embedding_dim,
hparams.encoder_embedding_dim,
kernel_size=hparams.encoder_kernel_size, stride=1,
padding=int((hparams.encoder_kernel_size - 1) / 2),
dilation=1, w_init_gain='relu'),
nn.BatchNorm1d(hparams.encoder_embedding_dim))
convolutions.append(conv_layer)
self.convolutions = nn.ModuleList(convolutions)
self.lstm = nn.LSTM(hparams.encoder_embedding_dim,
int(hparams.encoder_embedding_dim / 2), 1,
batch_first=True, bidirectional=True)
def forward(self, x, input_lengths):
for conv in self.convolutions:
x = F.dropout(F.relu(conv(x)), 0.5, self.training)
x = x.transpose(1, 2)
# pytorch tensors are not reversible, hence the conversion
input_lengths = input_lengths.cpu().numpy()
x = nn.utils.rnn.pack_padded_sequence(
x, input_lengths, batch_first=True)
self.lstm.flatten_parameters()
outputs, _ = self.lstm(x)
outputs, _ = nn.utils.rnn.pad_packed_sequence(
outputs, batch_first=True)
return outputs
def inference(self, x):
for conv in self.convolutions:
x = F.dropout(F.relu(conv(x)), 0.5, self.training)
x = x.transpose(1, 2)
self.lstm.flatten_parameters()
outputs, _ = self.lstm(x)
return outputs
class Decoder(nn.Module):
def __init__(self, hparams):
super(Decoder, self).__init__()
self.n_mel_channels = hparams.n_mel_channels
self.n_frames_per_step = hparams.n_frames_per_step
self.encoder_embedding_dim = hparams.encoder_embedding_dim
self.attention_rnn_dim = hparams.attention_rnn_dim
self.decoder_rnn_dim = hparams.decoder_rnn_dim
self.prenet_dim = hparams.prenet_dim
self.max_decoder_steps = hparams.max_decoder_steps
self.gate_threshold = hparams.gate_threshold
self.p_attention_dropout = hparams.p_attention_dropout
self.p_decoder_dropout = hparams.p_decoder_dropout
self.prenet = Prenet(
hparams.n_mel_channels * hparams.n_frames_per_step,
[hparams.prenet_dim, hparams.prenet_dim])
self.attention_rnn = nn.LSTMCell(
hparams.prenet_dim + hparams.encoder_embedding_dim,
hparams.attention_rnn_dim)
self.attention_layer = Attention(
hparams.attention_rnn_dim, hparams.encoder_embedding_dim,
hparams.attention_dim, hparams.attention_location_n_filters,
hparams.attention_location_kernel_size)
self.decoder_rnn = nn.LSTMCell(
hparams.attention_rnn_dim + hparams.encoder_embedding_dim,
hparams.decoder_rnn_dim, 1)
self.linear_projection = LinearNorm(
hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
hparams.n_mel_channels * hparams.n_frames_per_step)
self.gate_layer = LinearNorm(
hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, 1,
bias=True, w_init_gain='sigmoid')
def get_go_frame(self, memory):
""" Gets all zeros frames to use as first decoder input
PARAMS
------
memory: decoder outputs
RETURNS
-------
decoder_input: all zeros frames
"""
B = memory.size(0)
decoder_input = Variable(memory.data.new(
B, self.n_mel_channels * self.n_frames_per_step).zero_())
return decoder_input
def initialize_decoder_states(self, memory, mask):
""" Initializes attention rnn states, decoder rnn states, attention
weights, attention cumulative weights, attention context, stores memory
and stores processed memory
PARAMS
------
memory: Encoder outputs
mask: Mask for padded data if training, expects None for inference
"""
B = memory.size(0)
MAX_TIME = memory.size(1)
self.attention_hidden = Variable(memory.data.new(
B, self.attention_rnn_dim).zero_())
self.attention_cell = Variable(memory.data.new(
B, self.attention_rnn_dim).zero_())
self.decoder_hidden = Variable(memory.data.new(
B, self.decoder_rnn_dim).zero_())
self.decoder_cell = Variable(memory.data.new(
B, self.decoder_rnn_dim).zero_())
self.attention_weights = Variable(memory.data.new(
B, MAX_TIME).zero_())
self.attention_weights_cum = Variable(memory.data.new(
B, MAX_TIME).zero_())
self.attention_context = Variable(memory.data.new(
B, self.encoder_embedding_dim).zero_())
self.memory = memory
self.processed_memory = self.attention_layer.memory_layer(memory)
self.mask = mask
def parse_decoder_inputs(self, decoder_inputs):
""" Prepares decoder inputs, i.e. mel outputs
PARAMS
------
decoder_inputs: inputs used for teacher-forced training, i.e. mel-specs
RETURNS
-------
inputs: processed decoder inputs
"""
# (B, n_mel_channels, T_out) -> (B, T_out, n_mel_channels)
decoder_inputs = decoder_inputs.transpose(1, 2)
decoder_inputs = decoder_inputs.view(
decoder_inputs.size(0),
int(decoder_inputs.size(1)/self.n_frames_per_step), -1)
# (B, T_out, n_mel_channels) -> (T_out, B, n_mel_channels)
decoder_inputs = decoder_inputs.transpose(0, 1)
return decoder_inputs
def parse_decoder_outputs(self, mel_outputs, gate_outputs, alignments):
""" Prepares decoder outputs for output
PARAMS
------
mel_outputs:
gate_outputs: gate output energies
alignments:
RETURNS
-------
mel_outputs:
gate_outputs: gate output energies
alignments:
"""
# (T_out, B) -> (B, T_out)
alignments = torch.stack(alignments).transpose(0, 1)
# (T_out, B) -> (B, T_out)
gate_outputs = torch.stack(gate_outputs).transpose(0, 1)
gate_outputs = gate_outputs.contiguous()
# (T_out, B, n_mel_channels) -> (B, T_out, n_mel_channels)
mel_outputs = torch.stack(mel_outputs).transpose(0, 1).contiguous()
# decouple frames per step
mel_outputs = mel_outputs.view(
mel_outputs.size(0), -1, self.n_mel_channels)
# (B, T_out, n_mel_channels) -> (B, n_mel_channels, T_out)
mel_outputs = mel_outputs.transpose(1, 2)
return mel_outputs, gate_outputs, alignments
def decode(self, decoder_input):
""" Decoder step using stored states, attention and memory
PARAMS
------
decoder_input: previous mel output
RETURNS
-------
mel_output:
gate_output: gate output energies
attention_weights:
"""
cell_input = torch.cat((decoder_input, self.attention_context), -1)
self.attention_hidden, self.attention_cell = self.attention_rnn(
cell_input, (self.attention_hidden, self.attention_cell))
self.attention_hidden = F.dropout(
self.attention_hidden, self.p_attention_dropout, self.training)
attention_weights_cat = torch.cat(
(self.attention_weights.unsqueeze(1),
self.attention_weights_cum.unsqueeze(1)), dim=1)
self.attention_context, self.attention_weights = self.attention_layer(
self.attention_hidden, self.memory, self.processed_memory,
attention_weights_cat, self.mask)
self.attention_weights_cum += self.attention_weights
decoder_input = torch.cat(
(self.attention_hidden, self.attention_context), -1)
self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
decoder_input, (self.decoder_hidden, self.decoder_cell))
self.decoder_hidden = F.dropout(
self.decoder_hidden, self.p_decoder_dropout, self.training)
decoder_hidden_attention_context = torch.cat(
(self.decoder_hidden, self.attention_context), dim=1)
decoder_output = self.linear_projection(
decoder_hidden_attention_context)
gate_prediction = self.gate_layer(decoder_hidden_attention_context)
return decoder_output, gate_prediction, self.attention_weights
def forward(self, memory, decoder_inputs, memory_lengths):
""" Decoder forward pass for training
PARAMS
------
memory: Encoder outputs
decoder_inputs: Decoder inputs for teacher forcing. i.e. mel-specs
memory_lengths: Encoder output lengths for attention masking.
RETURNS
-------
mel_outputs: mel outputs from the decoder
gate_outputs: gate outputs from the decoder
alignments: sequence of attention weights from the decoder
"""
decoder_input = self.get_go_frame(memory).unsqueeze(0)
decoder_inputs = self.parse_decoder_inputs(decoder_inputs)
decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0)
decoder_inputs = self.prenet(decoder_inputs)
self.initialize_decoder_states(
memory, mask=~get_mask_from_lengths(memory_lengths))
mel_outputs, gate_outputs, alignments = [], [], []
while len(mel_outputs) < decoder_inputs.size(0) - 1:
decoder_input = decoder_inputs[len(mel_outputs)]
mel_output, gate_output, attention_weights = self.decode(
decoder_input)
mel_outputs += [mel_output.squeeze(1)]
gate_outputs += [gate_output.squeeze()]
alignments += [attention_weights]
mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
mel_outputs, gate_outputs, alignments)
return mel_outputs, gate_outputs, alignments
def inference(self, memory):
""" Decoder inference
PARAMS
------
memory: Encoder outputs
RETURNS
-------
mel_outputs: mel outputs from the decoder
gate_outputs: gate outputs from the decoder
alignments: sequence of attention weights from the decoder
"""
decoder_input = self.get_go_frame(memory)
self.initialize_decoder_states(memory, mask=None)
mel_outputs, gate_outputs, alignments = [], [], []
while True:
decoder_input = self.prenet(decoder_input)
mel_output, gate_output, alignment = self.decode(decoder_input)
mel_outputs += [mel_output.squeeze(1)]
gate_outputs += [gate_output]
alignments += [alignment]
if torch.sigmoid(gate_output.data) > self.gate_threshold:
break
elif len(mel_outputs) == self.max_decoder_steps:
print("Warning! Reached max decoder steps")
break
decoder_input = mel_output
mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
mel_outputs, gate_outputs, alignments)
return mel_outputs, gate_outputs, alignments
class Tacotron2(nn.Module):
def __init__(self, hparams):
super(Tacotron2, self).__init__()
self.mask_padding = hparams.mask_padding
self.fp16_run = hparams.fp16_run
self.n_mel_channels = hparams.n_mel_channels
self.n_frames_per_step = hparams.n_frames_per_step
self.embedding = nn.Embedding(
hparams.n_symbols, hparams.symbols_embedding_dim)
std = sqrt(2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim))
val = sqrt(3.0) * std # uniform bounds for std
self.embedding.weight.data.uniform_(-val, val)
self.encoder = Encoder(hparams)
self.decoder = Decoder(hparams)
self.postnet = Postnet(hparams)
def parse_batch(self, batch):
text_padded, input_lengths, mel_padded, gate_padded, \
output_lengths = batch
text_padded = to_gpu(text_padded).long()
input_lengths = to_gpu(input_lengths).long()
max_len = torch.max(input_lengths.data).item()
mel_padded = to_gpu(mel_padded).float()
gate_padded = to_gpu(gate_padded).float()
output_lengths = to_gpu(output_lengths).long()
return (
(text_padded, input_lengths, mel_padded, max_len, output_lengths),
(mel_padded, gate_padded))
def parse_output(self, outputs, output_lengths=None):
if self.mask_padding and output_lengths is not None:
mask = ~get_mask_from_lengths(output_lengths)
mask = mask.expand(self.n_mel_channels, mask.size(0), mask.size(1))
mask = mask.permute(1, 0, 2)
outputs[0].data.masked_fill_(mask, 0.0)
outputs[1].data.masked_fill_(mask, 0.0)
outputs[2].data.masked_fill_(mask[:, 0, :], 1e3) # gate energies
return outputs
def forward(self, inputs):
text_inputs, text_lengths, mels, max_len, output_lengths = inputs
text_lengths, output_lengths = text_lengths.data, output_lengths.data
embedded_inputs = self.embedding(text_inputs).transpose(1, 2)
encoder_outputs = self.encoder(embedded_inputs, text_lengths)
mel_outputs, gate_outputs, alignments = self.decoder(
encoder_outputs, mels, memory_lengths=text_lengths)
mel_outputs_postnet = self.postnet(mel_outputs)
mel_outputs_postnet = mel_outputs + mel_outputs_postnet
return self.parse_output(
[mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
output_lengths)
def inference(self, inputs):
embedded_inputs = self.embedding(inputs).transpose(1, 2)
encoder_outputs = self.encoder.inference(embedded_inputs)
mel_outputs, gate_outputs, alignments = self.decoder.inference(
encoder_outputs)
mel_outputs_postnet = self.postnet(mel_outputs)
mel_outputs_postnet = mel_outputs + mel_outputs_postnet
outputs = self.parse_output(
[mel_outputs, mel_outputs_postnet, gate_outputs, alignments])
return outputs
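
Tying it together, synthesis feeds `inference` a symbol sequence from the `text` module (a sketch; it assumes the `checkpoint_15000` file from the README stores weights under a `'state_dict'` key, as NVIDIA's training script does):

```
import torch
from hparams import create_hparams
from model import Tacotron2
from text import text_to_sequence

hparams = create_hparams()
model = Tacotron2(hparams).eval()
model.load_state_dict(torch.load('checkpoint_15000', map_location='cpu')['state_dict'])

seq = text_to_sequence('How may I help you today?', ['english_cleaners'])
seq = torch.LongTensor(seq).unsqueeze(0)
with torch.no_grad():
    _, mel_postnet, _, _ = model.inference(seq)  # mel input for the WaveGlow vocoder
```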

plotting_utils.py
View File

@@ -1,61 +0,0 @@
import matplotlib
matplotlib.use("Agg")
import matplotlib.pylab as plt
import numpy as np
def save_figure_to_numpy(fig):
# save it to a numpy array.
data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
return data
def plot_alignment_to_numpy(alignment, info=None):
fig, ax = plt.subplots(figsize=(6, 4))
im = ax.imshow(alignment, aspect='auto', origin='lower',
interpolation='none')
fig.colorbar(im, ax=ax)
xlabel = 'Decoder timestep'
if info is not None:
xlabel += '\n\n' + info
plt.xlabel(xlabel)
plt.ylabel('Encoder timestep')
plt.tight_layout()
fig.canvas.draw()
data = save_figure_to_numpy(fig)
plt.close()
return data
def plot_spectrogram_to_numpy(spectrogram):
fig, ax = plt.subplots(figsize=(12, 3))
im = ax.imshow(spectrogram, aspect="auto", origin="lower",
interpolation='none')
plt.colorbar(im, ax=ax)
plt.xlabel("Frames")
plt.ylabel("Channels")
plt.tight_layout()
fig.canvas.draw()
data = save_figure_to_numpy(fig)
plt.close()
return data
def plot_gate_outputs_to_numpy(gate_targets, gate_outputs):
fig, ax = plt.subplots(figsize=(12, 3))
ax.scatter(range(len(gate_targets)), gate_targets, alpha=0.5,
color='green', marker='+', s=1, label='target')
ax.scatter(range(len(gate_outputs)), gate_outputs, alpha=0.5,
color='red', marker='.', s=1, label='predicted')
plt.xlabel("Frames (Green target, Red predicted)")
plt.ylabel("Gate State")
plt.tight_layout()
fig.canvas.draw()
data = save_figure_to_numpy(fig)
plt.close()
return data

requirements.txt
View File

@@ -1,10 +0,0 @@
tensorflow
numpy
inflect==0.2.5
librosa==0.6.0
scipy
tensorboardX
Unidecode==1.0.22
pillow
torch==1.1.0
pysoundfile

View File

@@ -1,15 +0,0 @@
pip==18.1
bumpversion==0.5.3
wheel==0.32.1
watchdog==0.9.0
flake8==3.5.0
tox==3.5.2
coverage==4.5.1
Sphinx==1.8.1
twine==1.12.1
pytest==3.8.2
pytest-runner==4.2
pre-commit==1.16.1
python-language-server[all]
ipdb

View File

@@ -1,39 +0,0 @@
# -*- coding: utf-8 -*-
import grpc
import time
from sia.proto import tts_pb2
from sia.proto import tts_pb2_grpc
from concurrent import futures
from sia.instruments import do_time
from tts import TTSModel
class TTSServer(tts_pb2_grpc.ServerServicer):
def __init__(self):
self.tts_model = TTSModel()
def TextToSpeechAPI(self, request, context):
    input_text = request.text
    speech_response = self.tts_model.synth_speech(input_text)
    return tts_pb2.SpeechResponse(response=speech_response)
def main():
server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
tts_server = TTSServer()
tts_pb2_grpc.add_ServerServicer_to_server(tts_server, server)
server.add_insecure_port('localhost:50060')
server.start()
print('TTSServer started!')
try:
    while True:
        time.sleep(10000)
except KeyboardInterrupt:
    server.stop(0)
if __name__ == "__main__":
main()

stft.py
View File

@@ -1,141 +0,0 @@
"""
BSD 3-Clause License
Copyright (c) 2017, Prem Seetharaman
All rights reserved.
* Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
import torch
import numpy as np
import torch.nn.functional as F
from torch.autograd import Variable
from scipy.signal import get_window
from librosa.util import pad_center, tiny
from audio_processing import window_sumsquare
class STFT(torch.nn.Module):
"""adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
def __init__(self, filter_length=800, hop_length=200, win_length=800,
window='hann'):
super(STFT, self).__init__()
self.filter_length = filter_length
self.hop_length = hop_length
self.win_length = win_length
self.window = window
self.forward_transform = None
scale = self.filter_length / self.hop_length
fourier_basis = np.fft.fft(np.eye(self.filter_length))
cutoff = int((self.filter_length / 2 + 1))
fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),
np.imag(fourier_basis[:cutoff, :])])
forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
inverse_basis = torch.FloatTensor(
np.linalg.pinv(scale * fourier_basis).T[:, None, :])
if window is not None:
assert(filter_length >= win_length)
# get window and zero center pad it to filter_length
fft_window = get_window(window, win_length, fftbins=True)
fft_window = pad_center(fft_window, filter_length)
fft_window = torch.from_numpy(fft_window).float()
# window the bases
forward_basis *= fft_window
inverse_basis *= fft_window
self.register_buffer('forward_basis', forward_basis.float())
self.register_buffer('inverse_basis', inverse_basis.float())
def transform(self, input_data):
num_batches = input_data.size(0)
num_samples = input_data.size(1)
self.num_samples = num_samples
# similar to librosa, reflect-pad the input
input_data = input_data.view(num_batches, 1, num_samples)
input_data = F.pad(
input_data.unsqueeze(1),
(int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
mode='reflect')
input_data = input_data.squeeze(1)
forward_transform = F.conv1d(
input_data,
Variable(self.forward_basis, requires_grad=False),
stride=self.hop_length,
padding=0)
cutoff = int((self.filter_length / 2) + 1)
real_part = forward_transform[:, :cutoff, :]
imag_part = forward_transform[:, cutoff:, :]
magnitude = torch.sqrt(real_part**2 + imag_part**2)
phase = torch.autograd.Variable(
torch.atan2(imag_part.data, real_part.data))
return magnitude, phase
def inverse(self, magnitude, phase):
recombine_magnitude_phase = torch.cat(
[magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1)
inverse_transform = F.conv_transpose1d(
recombine_magnitude_phase,
Variable(self.inverse_basis, requires_grad=False),
stride=self.hop_length,
padding=0)
if self.window is not None:
window_sum = window_sumsquare(
self.window, magnitude.size(-1), hop_length=self.hop_length,
win_length=self.win_length, n_fft=self.filter_length,
dtype=np.float32)
# remove modulation effects
approx_nonzero_indices = torch.from_numpy(
np.where(window_sum > tiny(window_sum))[0])
window_sum = torch.autograd.Variable(
torch.from_numpy(window_sum), requires_grad=False)
#window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum #initially not commented out
inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]
# scale by hop ratio
inverse_transform *= float(self.filter_length) / self.hop_length
inverse_transform = inverse_transform[:, :, int(self.filter_length/2):]
inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2)]
return inverse_transform
def forward(self, input_data):
self.magnitude, self.phase = self.transform(input_data)
reconstruction = self.inverse(self.magnitude, self.phase)
return reconstruction
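
For reference, a minimal round-trip sketch for the STFT module above (not part of the diff). It assumes the class is saved as `stft.py` with `audio_processing.py` alongside it, as in the upstream NVIDIA layout, and uses a synthetic tone rather than a real recording:

```
import numpy as np
import torch

from stft import STFT  # the module above; filename assumed from the upstream layout

stft = STFT(filter_length=800, hop_length=200, win_length=800, window='hann')

# One second of a 440 Hz tone at 22050 Hz, shaped (batch, num_samples).
t = np.linspace(0, 1, 22050, endpoint=False, dtype=np.float32)
audio = torch.from_numpy(0.5 * np.sin(2 * np.pi * 440 * t)).unsqueeze(0)

magnitude, phase = stft.transform(audio)  # each (1, filter_length/2 + 1, n_frames)
reconstruction = stft(audio)              # inverse(transform(x)), shape (1, 1, ~num_samples)
print(magnitude.shape, reconstruction.shape)
```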

View File

@@ -1,22 +1,23 @@
# -*- coding: utf-8 -*-
""" from https://github.com/keithito/tacotron """
import re
from text import cleaners
from text.symbols import symbols
from . import cleaners
from .symbols import symbols
# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}
# Regular expression matching text enclosed in curly braces:
_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")
def text_to_sequence(text, cleaner_names):
'''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
"""Converts a string of text to a sequence of IDs corresponding to the
symbols in the text.
The text can optionally have ARPAbet sequences enclosed in curly braces embedded
in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
The text can optionally have ARPAbet sequences enclosed in curly braces
embedded in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
Args:
text: string to convert to a sequence
@@ -24,51 +25,53 @@ def text_to_sequence(text, cleaner_names):
Returns:
List of integers corresponding to the symbols in the text
'''
sequence = []
"""
sequence = []
# Check for curly braces and treat their contents as ARPAbet:
while len(text):
m = _curly_re.match(text)
if not m:
sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
break
sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
sequence += _arpabet_to_sequence(m.group(2))
text = m.group(3)
# Check for curly braces and treat their contents as ARPAbet:
while len(text):
m = _curly_re.match(text)
if not m:
sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
break
sequence += _symbols_to_sequence(
_clean_text(m.group(1), cleaner_names)
)
sequence += _arpabet_to_sequence(m.group(2))
text = m.group(3)
return sequence
return sequence
def sequence_to_text(sequence):
'''Converts a sequence of IDs back to a string'''
result = ''
for symbol_id in sequence:
if symbol_id in _id_to_symbol:
s = _id_to_symbol[symbol_id]
# Enclose ARPAbet back in curly braces:
if len(s) > 1 and s[0] == '@':
s = '{%s}' % s[1:]
result += s
return result.replace('}{', ' ')
"""Converts a sequence of IDs back to a string"""
result = ""
for symbol_id in sequence:
if symbol_id in _id_to_symbol:
s = _id_to_symbol[symbol_id]
# Enclose ARPAbet back in curly braces:
if len(s) > 1 and s[0] == "@":
s = "{%s}" % s[1:]
result += s
return result.replace("}{", " ")
def _clean_text(text, cleaner_names):
for name in cleaner_names:
cleaner = getattr(cleaners, name)
if not cleaner:
raise Exception('Unknown cleaner: %s' % name)
text = cleaner(text)
return text
for name in cleaner_names:
cleaner = getattr(cleaners, name)
if not cleaner:
raise Exception("Unknown cleaner: %s" % name)
text = cleaner(text)
return text
def _symbols_to_sequence(symbols):
return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
def _arpabet_to_sequence(text):
return _symbols_to_sequence(['@' + s for s in text.split()])
return _symbols_to_sequence(["@" + s for s in text.split()])
def _should_keep_symbol(s):
return s in _symbol_to_id and s is not '_' and s is not '~'
return s in _symbol_to_id and s != "_" and s != "~"
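
A quick usage sketch for the text front end above (assuming the repo's `text` package is importable); ARPAbet in curly braces is passed straight through as phoneme symbols:

```
from text import text_to_sequence, sequence_to_text

line = "Turn left on {HH AW1 S S T AH0 N} Street."
seq = text_to_sequence(line, ["english_cleaners"])
print(seq[:10])               # integer symbol IDs fed to the model
print(sequence_to_text(seq))  # ARPAbet spans come back wrapped in {...}
```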

View File

@@ -1,90 +1,99 @@
""" from https://github.com/keithito/tacotron """
'''
Cleaners are transformations that run over the input text at both training and eval time.
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
1. "english_cleaners" for English text
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
the Unidecode library (https://pypi.python.org/pypi/Unidecode)
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
the symbols in symbols.py to match your data).
'''
# -*- coding: utf-8 -*-
import re
from unidecode import unidecode
from .numbers import normalize_numbers
""" from https://github.com/keithito/tacotron """
"""
Cleaners are transformations that run over the input text at both training and
eval time.
Cleaners can be selected by passing a comma-delimited list of cleaner names as
the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to
use:
1. "english_cleaners" for English text
2. "transliteration_cleaners" for non-English text that can be transliterated
to ASCII using
the Unidecode library (https://pypi.python.org/pypi/Unidecode)
3. "basic_cleaners" if you do not want to transliterate (in this case, you
should also update
the symbols in symbols.py to match your data).
"""
# Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+')
_whitespace_re = re.compile(r"\s+")
# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
('mrs', 'misess'),
('mr', 'mister'),
('dr', 'doctor'),
('st', 'saint'),
('co', 'company'),
('jr', 'junior'),
('maj', 'major'),
('gen', 'general'),
('drs', 'doctors'),
('rev', 'reverend'),
('lt', 'lieutenant'),
('hon', 'honorable'),
('sgt', 'sergeant'),
('capt', 'captain'),
('esq', 'esquire'),
('ltd', 'limited'),
('col', 'colonel'),
('ft', 'fort'),
]]
_abbreviations = [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
("mrs", "misess"),
("mr", "mister"),
("dr", "doctor"),
("st", "saint"),
("co", "company"),
("jr", "junior"),
("maj", "major"),
("gen", "general"),
("drs", "doctors"),
("rev", "reverend"),
("lt", "lieutenant"),
("hon", "honorable"),
("sgt", "sergeant"),
("capt", "captain"),
("esq", "esquire"),
("ltd", "limited"),
("col", "colonel"),
("ft", "fort"),
]
]
def expand_abbreviations(text):
for regex, replacement in _abbreviations:
text = re.sub(regex, replacement, text)
return text
for regex, replacement in _abbreviations:
text = re.sub(regex, replacement, text)
return text
def expand_numbers(text):
return normalize_numbers(text)
return normalize_numbers(text)
def lowercase(text):
return text.lower()
return text.lower()
def collapse_whitespace(text):
return re.sub(_whitespace_re, ' ', text)
return re.sub(_whitespace_re, " ", text)
def convert_to_ascii(text):
return unidecode(text)
return unidecode(text)
def basic_cleaners(text):
'''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
text = lowercase(text)
text = collapse_whitespace(text)
return text
"""Basic pipeline that lowercases and collapses whitespace without
transliteration."""
text = lowercase(text)
text = collapse_whitespace(text)
return text
def transliteration_cleaners(text):
'''Pipeline for non-English text that transliterates to ASCII.'''
text = convert_to_ascii(text)
text = lowercase(text)
text = collapse_whitespace(text)
return text
"""Pipeline for non-English text that transliterates to ASCII."""
text = convert_to_ascii(text)
text = lowercase(text)
text = collapse_whitespace(text)
return text
def english_cleaners(text):
'''Pipeline for English text, including number and abbreviation expansion.'''
text = convert_to_ascii(text)
text = lowercase(text)
text = expand_numbers(text)
text = expand_abbreviations(text)
text = collapse_whitespace(text)
return text
"""Pipeline for English text, including number and abbreviation
expansion."""
text = convert_to_ascii(text)
text = lowercase(text)
text = expand_numbers(text)
text = expand_abbreviations(text)
text = collapse_whitespace(text)
return text
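
To illustrate the cleaner pipeline above, a small sketch (assumes the `text` package plus its `unidecode` and `inflect` dependencies are installed):

```
from text.cleaners import english_cleaners

raw = "Dr. Müller owes $3.50."
print(english_cleaners(raw))
# -> "doctor muller owes three dollars, fifty cents." (ASCII, lowercased,
#    numbers and abbreviations expanded, whitespace collapsed)
```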

View File

@@ -1,65 +1,143 @@
# -*- coding: utf-8 -*-
""" from https://github.com/keithito/tacotron """
import re
valid_symbols = [
'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
"AA",
"AA0",
"AA1",
"AA2",
"AE",
"AE0",
"AE1",
"AE2",
"AH",
"AH0",
"AH1",
"AH2",
"AO",
"AO0",
"AO1",
"AO2",
"AW",
"AW0",
"AW1",
"AW2",
"AY",
"AY0",
"AY1",
"AY2",
"B",
"CH",
"D",
"DH",
"EH",
"EH0",
"EH1",
"EH2",
"ER",
"ER0",
"ER1",
"ER2",
"EY",
"EY0",
"EY1",
"EY2",
"F",
"G",
"HH",
"IH",
"IH0",
"IH1",
"IH2",
"IY",
"IY0",
"IY1",
"IY2",
"JH",
"K",
"L",
"M",
"N",
"NG",
"OW",
"OW0",
"OW1",
"OW2",
"OY",
"OY0",
"OY1",
"OY2",
"P",
"R",
"S",
"SH",
"T",
"TH",
"UH",
"UH0",
"UH1",
"UH2",
"UW",
"UW0",
"UW1",
"UW2",
"V",
"W",
"Y",
"Z",
"ZH",
]
_valid_symbol_set = set(valid_symbols)
class CMUDict:
'''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''
def __init__(self, file_or_path, keep_ambiguous=True):
if isinstance(file_or_path, str):
with open(file_or_path, encoding='latin-1') as f:
entries = _parse_cmudict(f)
else:
entries = _parse_cmudict(file_or_path)
if not keep_ambiguous:
entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
self._entries = entries
"""Thin wrapper around CMUDict data.
http://www.speech.cs.cmu.edu/cgi-bin/cmudict"""
def __init__(self, file_or_path, keep_ambiguous=True):
if isinstance(file_or_path, str):
with open(file_or_path, encoding="latin-1") as f:
entries = _parse_cmudict(f)
else:
entries = _parse_cmudict(file_or_path)
if not keep_ambiguous:
entries = {
word: pron for word, pron in entries.items() if len(pron) == 1
}
self._entries = entries
def __len__(self):
return len(self._entries)
def lookup(self, word):
"""Returns list of ARPAbet pronunciations of the given word."""
return self._entries.get(word.upper())
def __len__(self):
return len(self._entries)
def lookup(self, word):
'''Returns list of ARPAbet pronunciations of the given word.'''
return self._entries.get(word.upper())
_alt_re = re.compile(r'\([0-9]+\)')
_alt_re = re.compile(r"\([0-9]+\)")
def _parse_cmudict(file):
cmudict = {}
for line in file:
if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"):
parts = line.split(' ')
word = re.sub(_alt_re, '', parts[0])
pronunciation = _get_pronunciation(parts[1])
if pronunciation:
if word in cmudict:
cmudict[word].append(pronunciation)
else:
cmudict[word] = [pronunciation]
return cmudict
cmudict = {}
for line in file:
if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"):
parts = line.split(" ")
word = re.sub(_alt_re, "", parts[0])
pronunciation = _get_pronunciation(parts[1])
if pronunciation:
if word in cmudict:
cmudict[word].append(pronunciation)
else:
cmudict[word] = [pronunciation]
return cmudict
def _get_pronunciation(s):
parts = s.strip().split(' ')
for part in parts:
if part not in _valid_symbol_set:
return None
return ' '.join(parts)
parts = s.strip().split(" ")
for part in parts:
if part not in _valid_symbol_set:
return None
return " ".join(parts)

View File

@@ -1,71 +1,73 @@
# -*- coding: utf-8 -*-
""" from https://github.com/keithito/tacotron """
import inflect
import re
_inflect = inflect.engine()
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
_number_re = re.compile(r'[0-9]+')
_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
_number_re = re.compile(r"[0-9]+")
def _remove_commas(m):
return m.group(1).replace(',', '')
return m.group(1).replace(",", "")
def _expand_decimal_point(m):
return m.group(1).replace('.', ' point ')
return m.group(1).replace(".", " point ")
def _expand_dollars(m):
match = m.group(1)
parts = match.split('.')
if len(parts) > 2:
return match + ' dollars' # Unexpected format
dollars = int(parts[0]) if parts[0] else 0
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
if dollars and cents:
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
cent_unit = 'cent' if cents == 1 else 'cents'
return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
elif dollars:
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
return '%s %s' % (dollars, dollar_unit)
elif cents:
cent_unit = 'cent' if cents == 1 else 'cents'
return '%s %s' % (cents, cent_unit)
else:
return 'zero dollars'
match = m.group(1)
parts = match.split(".")
if len(parts) > 2:
return match + " dollars" # Unexpected format
dollars = int(parts[0]) if parts[0] else 0
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
if dollars and cents:
dollar_unit = "dollar" if dollars == 1 else "dollars"
cent_unit = "cent" if cents == 1 else "cents"
return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
elif dollars:
dollar_unit = "dollar" if dollars == 1 else "dollars"
return "%s %s" % (dollars, dollar_unit)
elif cents:
cent_unit = "cent" if cents == 1 else "cents"
return "%s %s" % (cents, cent_unit)
else:
return "zero dollars"
def _expand_ordinal(m):
return _inflect.number_to_words(m.group(0))
return _inflect.number_to_words(m.group(0))
def _expand_number(m):
num = int(m.group(0))
if num > 1000 and num < 3000:
if num == 2000:
return 'two thousand'
elif num > 2000 and num < 2010:
return 'two thousand ' + _inflect.number_to_words(num % 100)
elif num % 100 == 0:
return _inflect.number_to_words(num // 100) + ' hundred'
num = int(m.group(0))
if num > 1000 and num < 3000:
if num == 2000:
return "two thousand"
elif num > 2000 and num < 2010:
return "two thousand " + _inflect.number_to_words(num % 100)
elif num % 100 == 0:
return _inflect.number_to_words(num // 100) + " hundred"
else:
return _inflect.number_to_words(
num, andword="", zero="oh", group=2
).replace(", ", " ")
else:
return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
else:
return _inflect.number_to_words(num, andword='')
return _inflect.number_to_words(num, andword="")
def normalize_numbers(text):
text = re.sub(_comma_number_re, _remove_commas, text)
text = re.sub(_pounds_re, r'\1 pounds', text)
text = re.sub(_dollars_re, _expand_dollars, text)
text = re.sub(_decimal_number_re, _expand_decimal_point, text)
text = re.sub(_ordinal_re, _expand_ordinal, text)
text = re.sub(_number_re, _expand_number, text)
return text
text = re.sub(_comma_number_re, _remove_commas, text)
text = re.sub(_pounds_re, r"\1 pounds", text)
text = re.sub(_dollars_re, _expand_dollars, text)
text = re.sub(_decimal_number_re, _expand_decimal_point, text)
text = re.sub(_ordinal_re, _expand_ordinal, text)
text = re.sub(_number_re, _expand_number, text)
return text
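
`normalize_numbers` above chains the regex substitutions in a fixed order (commas, pounds, dollars, decimals, ordinals, plain numbers); a quick sketch, assuming `inflect` is installed:

```
from text.numbers import normalize_numbers

print(normalize_numbers("I have $1,000.50 in 2 accounts since 1998."))
# roughly: "I have one thousand dollars, fifty cents in two accounts
#           since nineteen ninety-eight."
```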

View File

@@ -1,18 +1,24 @@
""" from https://github.com/keithito/tacotron """
# -*- coding: utf-8 -*-
from . import cmudict
'''
""" from https://github.com/keithito/tacotron """
"""
Defines the set of symbols used in text input to the model.
The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. '''
from text import cmudict
The default is a set of ASCII characters that works well for English or text
that has been run through Unidecode. For other data, you can modify
_characters. See TRAINING_DATA.md for details. """
_pad = '_'
_punctuation = '!\'(),.:;? '
_special = '-'
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_pad = "_"
_punctuation = "!'(),.:;? "
_special = "-"
_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
_arpabet = ['@' + s for s in cmudict.valid_symbols]
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as
# uppercase letters):
_arpabet = ["@" + s for s in cmudict.valid_symbols]
# Export all symbols:
symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet
symbols = (
[_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet
)
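
The exported `symbols` list above is what the `_symbol_to_id` / `_id_to_symbol` maps in `text/__init__.py` are built from; a quick look at its layout:

```
from text.symbols import symbols

print(len(symbols))  # pad + special + punctuation + letters + '@'-prefixed ARPAbet
print(symbols[:5])   # ['_', '-', '!', "'", '(']
print(symbols[-1])   # '@ZH'
```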

290
train.py
View File

@@ -1,290 +0,0 @@
import os
import time
import argparse
import math
from numpy import finfo
import torch
from distributed import apply_gradient_allreduce
import torch.distributed as dist
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import DataLoader
from model import Tacotron2
from data_utils import TextMelLoader, TextMelCollate
from loss_function import Tacotron2Loss
from logger import Tacotron2Logger
from hparams import create_hparams
def reduce_tensor(tensor, n_gpus):
rt = tensor.clone()
dist.all_reduce(rt, op=dist.reduce_op.SUM)
rt /= n_gpus
return rt
def init_distributed(hparams, n_gpus, rank, group_name):
assert torch.cuda.is_available(), "Distributed mode requires CUDA."
print("Initializing Distributed")
# Set cuda device so everything is done on the right GPU.
torch.cuda.set_device(rank % torch.cuda.device_count())
# Initialize distributed communication
dist.init_process_group(
backend=hparams.dist_backend, init_method=hparams.dist_url,
world_size=n_gpus, rank=rank, group_name=group_name)
print("Done initializing distributed")
def prepare_dataloaders(hparams):
# Get data, data loaders and collate function ready
trainset = TextMelLoader(hparams.training_files, hparams)
valset = TextMelLoader(hparams.validation_files, hparams)
collate_fn = TextMelCollate(hparams.n_frames_per_step)
if hparams.distributed_run:
train_sampler = DistributedSampler(trainset)
shuffle = False
else:
train_sampler = None
shuffle = True
train_loader = DataLoader(trainset, num_workers=1, shuffle=shuffle,
sampler=train_sampler,
batch_size=hparams.batch_size, pin_memory=False,
drop_last=True, collate_fn=collate_fn)
return train_loader, valset, collate_fn
def prepare_directories_and_logger(output_directory, log_directory, rank):
if rank == 0:
if not os.path.isdir(output_directory):
os.makedirs(output_directory)
os.chmod(output_directory, 0o775)
logger = Tacotron2Logger(os.path.join(output_directory, log_directory))
else:
logger = None
return logger
def load_model(hparams):
model = Tacotron2(hparams)
if hparams.fp16_run:
model.decoder.attention_layer.score_mask_value = finfo('float16').min
if hparams.distributed_run:
model = apply_gradient_allreduce(model)
return model
def warm_start_model(checkpoint_path, model, ignore_layers):
assert os.path.isfile(checkpoint_path)
print("Warm starting model from checkpoint '{}'".format(checkpoint_path))
checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
model_dict = checkpoint_dict['state_dict']
if len(ignore_layers) > 0:
model_dict = {k: v for k, v in model_dict.items()
if k not in ignore_layers}
dummy_dict = model.state_dict()
dummy_dict.update(model_dict)
model_dict = dummy_dict
model.load_state_dict(model_dict)
return model
def load_checkpoint(checkpoint_path, model, optimizer):
assert os.path.isfile(checkpoint_path)
print("Loading checkpoint '{}'".format(checkpoint_path))
checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
model.load_state_dict(checkpoint_dict['state_dict'])
optimizer.load_state_dict(checkpoint_dict['optimizer'])
learning_rate = checkpoint_dict['learning_rate']
iteration = checkpoint_dict['iteration']
print("Loaded checkpoint '{}' from iteration {}" .format(
checkpoint_path, iteration))
return model, optimizer, learning_rate, iteration
def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
print("Saving model and optimizer state at iteration {} to {}".format(
iteration, filepath))
torch.save({'iteration': iteration,
'state_dict': model.state_dict(),
'optimizer': optimizer.state_dict(),
'learning_rate': learning_rate}, filepath)
def validate(model, criterion, valset, iteration, batch_size, n_gpus,
collate_fn, logger, distributed_run, rank):
"""Handles all the validation scoring and printing"""
model.eval()
with torch.no_grad():
val_sampler = DistributedSampler(valset) if distributed_run else None
val_loader = DataLoader(valset, sampler=val_sampler, num_workers=1,
shuffle=False, batch_size=batch_size,
pin_memory=False, collate_fn=collate_fn)
val_loss = 0.0
for i, batch in enumerate(val_loader):
x, y = model.parse_batch(batch)
y_pred = model(x)
loss = criterion(y_pred, y)
if distributed_run:
reduced_val_loss = reduce_tensor(loss.data, n_gpus).item()
else:
reduced_val_loss = loss.item()
val_loss += reduced_val_loss
val_loss = val_loss / (i + 1)
model.train()
if rank == 0:
print("Validation loss {}: {:9f} ".format(iteration, reduced_val_loss))
logger.log_validation(reduced_val_loss, model, y, y_pred, iteration)
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
rank, group_name, hparams):
"""Training and validation logging results to tensorboard and stdout
Params
------
output_directory (string): directory to save checkpoints
log_directory (string): directory to save tensorboard logs
checkpoint_path (string): checkpoint path
n_gpus (int): number of gpus
rank (int): rank of current gpu
hparams (object): hyperparameters, parsed from comma-separated "name=value" pairs
"""
if hparams.distributed_run:
init_distributed(hparams, n_gpus, rank, group_name)
torch.manual_seed(hparams.seed)
torch.cuda.manual_seed(hparams.seed)
model = load_model(hparams)
learning_rate = hparams.learning_rate
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
weight_decay=hparams.weight_decay)
if hparams.fp16_run:
from apex import amp
model, optimizer = amp.initialize(
model, optimizer, opt_level='O2')
if hparams.distributed_run:
model = apply_gradient_allreduce(model)
criterion = Tacotron2Loss()
logger = prepare_directories_and_logger(
output_directory, log_directory, rank)
train_loader, valset, collate_fn = prepare_dataloaders(hparams)
# Load checkpoint if one exists
iteration = 0
epoch_offset = 0
if checkpoint_path is not None:
if warm_start:
model = warm_start_model(
checkpoint_path, model, hparams.ignore_layers)
else:
model, optimizer, _learning_rate, iteration = load_checkpoint(
checkpoint_path, model, optimizer)
if hparams.use_saved_learning_rate:
learning_rate = _learning_rate
iteration += 1 # next iteration is iteration + 1
epoch_offset = max(0, int(iteration / len(train_loader)))
model.train()
is_overflow = False
# ================ MAIN TRAINING LOOP! ===================
for epoch in range(epoch_offset, hparams.epochs):
print("Epoch: {}".format(epoch))
for i, batch in enumerate(train_loader):
start = time.perf_counter()
for param_group in optimizer.param_groups:
param_group['lr'] = learning_rate
model.zero_grad()
x, y = model.parse_batch(batch)
y_pred = model(x)
loss = criterion(y_pred, y)
if hparams.distributed_run:
reduced_loss = reduce_tensor(loss.data, n_gpus).item()
else:
reduced_loss = loss.item()
if hparams.fp16_run:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
if hparams.fp16_run:
grad_norm = torch.nn.utils.clip_grad_norm_(
amp.master_params(optimizer), hparams.grad_clip_thresh)
is_overflow = math.isnan(grad_norm)
else:
grad_norm = torch.nn.utils.clip_grad_norm_(
model.parameters(), hparams.grad_clip_thresh)
optimizer.step()
if not is_overflow and rank == 0:
duration = time.perf_counter() - start
print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
iteration, reduced_loss, grad_norm, duration))
logger.log_training(
reduced_loss, grad_norm, learning_rate, duration, iteration)
if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0):
validate(model, criterion, valset, iteration,
hparams.batch_size, n_gpus, collate_fn, logger,
hparams.distributed_run, rank)
if rank == 0:
checkpoint_path = os.path.join(
output_directory, "checkpoint_{}".format(iteration))
save_checkpoint(model, optimizer, learning_rate, iteration,
checkpoint_path)
iteration += 1
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-o', '--output_directory', type=str,
help='directory to save checkpoints')
parser.add_argument('-l', '--log_directory', type=str,
help='directory to save tensorboard logs')
parser.add_argument('-c', '--checkpoint_path', type=str, default=None,
required=False, help='checkpoint path')
parser.add_argument('--warm_start', action='store_true',
help='load model weights only, ignore specified layers')
parser.add_argument('--n_gpus', type=int, default=1,
required=False, help='number of gpus')
parser.add_argument('--rank', type=int, default=0,
required=False, help='rank of current gpu')
parser.add_argument('--group_name', type=str, default='group_name',
required=False, help='Distributed group name')
parser.add_argument('--hparams', type=str,
required=False, help='comma separated name=value pairs')
args = parser.parse_args()
hparams = create_hparams(args.hparams)
torch.backends.cudnn.enabled = hparams.cudnn_enabled
torch.backends.cudnn.benchmark = hparams.cudnn_benchmark
print("FP16 Run:", hparams.fp16_run)
print("Dynamic Loss Scaling:", hparams.dynamic_loss_scaling)
print("Distributed Run:", hparams.distributed_run)
print("cuDNN Enabled:", hparams.cudnn_enabled)
print("cuDNN Benchmark:", hparams.cudnn_benchmark)
train(args.output_directory, args.log_directory, args.checkpoint_path,
args.warm_start, args.n_gpus, args.rank, args.group_name, hparams)
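
For the record, a hypothetical programmatic equivalent of the removed `train.py` entry point (directory names and the hparams string are placeholders):

```
from hparams import create_hparams
from train import train

hparams = create_hparams("batch_size=32,epochs=500")  # placeholder overrides
train(output_directory="outdir", log_directory="logdir",
      checkpoint_path=None, warm_start=False,
      n_gpus=1, rank=0, group_name="group_name", hparams=hparams)
```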

177
tts.py
View File

@@ -1,177 +0,0 @@
#!/usr/bin/env python
# coding: utf-8
import sys
import numpy as np
import torch
from hparams import create_hparams
from model import Tacotron2
from train import load_model
from text import text_to_sequence
import os
import soundfile as sf
import pyaudio
import klepto
from librosa import resample
from librosa.effects import time_stretch
from sia.file_utils import cached_model_path
from sia.instruments import do_time
from glow import WaveGlow
TTS_SAMPLE_RATE = 22050
OUTPUT_SAMPLE_RATE = 16000
# https://github.com/NVIDIA/waveglow/blob/master/config.json
WAVEGLOW_CONFIG = {
"n_mel_channels": 80,
"n_flows": 12,
"n_group": 8,
"n_early_every": 4,
"n_early_size": 2,
"WN_config": {
"n_layers": 8,
"n_channels": 256,
"kernel_size": 3
}
}
class TTSModel(object):
"""docstring for TTSModel."""
def __init__(self):
super(TTSModel, self).__init__()
hparams = create_hparams()
hparams.sampling_rate = TTS_SAMPLE_RATE
self.model = load_model(hparams)
tacotron2_path = cached_model_path("tacotron2_model")
self.model.load_state_dict(
torch.load(tacotron2_path, map_location='cpu')['state_dict'])
self.model.eval()
waveglow_path = cached_model_path('waveglow_model')
self.waveglow = WaveGlow(**WAVEGLOW_CONFIG)
wave_params = torch.load(waveglow_path, map_location='cpu')
self.waveglow.load_state_dict(wave_params)
self.waveglow.eval()
for k in self.waveglow.convinv:
k.float()
self.k_cache = klepto.archives.file_archive(cached=False)
self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
self.synth_speech)
# https://github.com/NVIDIA/waveglow/issues/127
for m in self.waveglow.modules():
if 'Conv' in str(type(m)):
setattr(m, 'padding_mode', 'zeros')
@do_time
def synth_speech(self, t):
text = t
sequence = np.array(text_to_sequence(text,
['english_cleaners']))[None, :]
sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long()
mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(
sequence)
with torch.no_grad():
audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)
audio = audio_t[0].data.cpu().numpy()
# data = convert(audio)
slow_data = time_stretch(audio, 0.8)
float_data = resample(slow_data, TTS_SAMPLE_RATE, OUTPUT_SAMPLE_RATE)
data = float2pcm(float_data)
return data.tobytes()
def convert(array):
sf.write('sample.wav', array, TTS_SAMPLE_RATE)
# convert to $OUTPUT_SAMPLE_RATE
os.system('ffmpeg -i {0} -filter:a "atempo=0.80" -ar 16k {1}'.format(
'sample.wav', 'sample0.wav'))
data, rate = sf.read('sample0.wav', dtype='int16')
os.remove('sample.wav')
os.remove('sample0.wav')
return data
# https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
def float2pcm(sig, dtype='int16'):
"""Convert floating point signal with a range from -1 to 1 to PCM.
Any signal values outside the interval [-1.0, 1.0) are clipped.
No dithering is used.
Note that there are different possibilities for scaling floating
point numbers to PCM numbers, this function implements just one of
them. For an overview of alternatives see
http://blog.bjornroche.com/2009/12/int-float-int-its-jungle-out-there.html
Parameters
----------
sig : array_like
Input array, must have floating point type.
dtype : data type, optional
Desired (integer) data type.
Returns
-------
numpy.ndarray
Integer data, scaled and clipped to the range of the given
*dtype*.
See Also
--------
pcm2float, dtype
"""
sig = np.asarray(sig)
if sig.dtype.kind != 'f':
raise TypeError("'sig' must be a float array")
dtype = np.dtype(dtype)
if dtype.kind not in 'iu':
raise TypeError("'dtype' must be an integer type")
i = np.iinfo(dtype)
abs_max = 2**(i.bits - 1)
offset = i.min + abs_max
return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)
def display(data):
import IPython.display as ipd
aud = ipd.Audio(data, rate=16000)
return aud
def player_gen():
audio_interface = pyaudio.PyAudio()
_audio_stream = audio_interface.open(format=pyaudio.paInt16,
channels=1,
rate=OUTPUT_SAMPLE_RATE,
output=True)
def play_device(data):
_audio_stream.write(data)
# _audio_stream.close()
return play_device
def synthesize_corpus():
tts_model = TTSModel()
all_data = []
for (i, line) in enumerate(open('corpus.txt').readlines()):
print('synthesizing... "{}"'.format(line.strip()))
data = tts_model.synth_speech(line.strip())
all_data.append(data)
return all_data
def play_corpus(corpus_synths):
player = player_gen()
for d in corpus_synths:
player(d)
def main():
corpus_synth_data = synthesize_corpus()
play_corpus(corpus_synth_data)
if __name__ == '__main__':
main()
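
A minimal synthesis sketch against the removed `TTSModel` wrapper above; it assumes the repo's modules and the cached model checkpoints (via `sia.file_utils`) are available, and writes the 16 kHz PCM to a file instead of streaming through pyaudio:

```
import numpy as np
import soundfile as sf

from tts import TTSModel, OUTPUT_SAMPLE_RATE

model = TTSModel()  # loads the Tacotron 2 and WaveGlow checkpoints
pcm_bytes = model.synth_speech("Hello from the text to speech server.")
audio = np.frombuffer(pcm_bytes, dtype=np.int16)  # synth_speech returns int16 PCM bytes
sf.write("hello.wav", audio, OUTPUT_SAMPLE_RATE)
```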

View File

@@ -1,29 +0,0 @@
import numpy as np
from scipy.io.wavfile import read
import torch
def get_mask_from_lengths(lengths):
max_len = torch.max(lengths).item()
ids = torch.arange(0, max_len, out=torch.LongTensor(max_len))  # upstream allocates out=torch.cuda.LongTensor(max_len); changed for CPU inference
mask = (ids < lengths.unsqueeze(1)).byte()
return mask
def load_wav_to_torch(full_path):
sampling_rate, data = read(full_path)
return torch.FloatTensor(data.astype(np.float32)), sampling_rate
def load_filepaths_and_text(filename, split="|"):
with open(filename, encoding='utf-8') as f:
filepaths_and_text = [line.strip().split(split) for line in f]
return filepaths_and_text
def to_gpu(x):
x = x.contiguous()
# if torch.cuda.is_available():          # enabled upstream; commented out
#     x = x.cuda(non_blocking=True)      # here for CPU-only serving
return torch.autograd.Variable(x)
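
A sketch of `get_mask_from_lengths` on a padded batch (assuming these helpers live in `utils.py`, as upstream):

```
import torch

from utils import get_mask_from_lengths

lengths = torch.LongTensor([3, 5, 2])
print(get_mask_from_lengths(lengths))
# Each row has `length` ones followed by zeros, up to the batch max of 5:
# tensor([[1, 1, 1, 0, 0],
#         [1, 1, 1, 1, 1],
#         [1, 1, 0, 0, 0]], dtype=torch.uint8)
```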