mirror of https://github.com/malarinv/tacotron2 synced 2026-03-09 02:02:33 +00:00

1 commit

Author SHA1 Message Date
Rafael Valle
723e869d4b Dockerfile: adding dockerfile 2018-05-04 09:39:05 -07:00
18 changed files with 178 additions and 247 deletions

.gitmodules vendored (3 changed lines)
View File

@@ -1,3 +0,0 @@
-[submodule "waveglow"]
-	path = waveglow
-	url = https://github.com/NVIDIA/waveglow

Dockerfile
View File

@@ -1,2 +1,3 @@
 FROM pytorch/pytorch:0.4_cuda9_cudnn7
-RUN pip install numpy scipy matplotlib librosa==0.6.0 tensorflow tensorboardX inflect==0.2.5 Unidecode==1.0.22 jupyter
+RUN pip install numpy scipy matplotlib librosa==0.6.0 tensorflow tensorboardX inflect==0.2.5 Unidecode==1.0.22

README.md
View File

@@ -11,9 +11,6 @@ Distributed and FP16 support relies on work by Christian Sarofeen and NVIDIA's
 ![Alignment, Predicted Mel Spectrogram, Target Mel Spectrogram](tensorboard.png)
-[Download demo audio](https://github.com/NVIDIA/tacotron2/blob/master/demo.wav) trained on LJS and using Ryuichi Yamamoto's [pre-trained Mixture of Logistics
-wavenet](https://github.com/r9y9/wavenet_vocoder/)
-"Scientists at the CERN laboratory say they have discovered a new particle."

 ## Pre-requisites
 1. NVIDIA GPU + CUDA cuDNN
@@ -23,12 +20,11 @@ wavenet](https://github.com/r9y9/wavenet_vocoder/)
 2. Clone this repo: `git clone https://github.com/NVIDIA/tacotron2.git`
 3. CD into this repo: `cd tacotron2`
 4. Update .wav paths: `sed -i -- 's,DUMMY,ljs_dataset_folder/wavs,g' filelists/*.txt`
-    - Alternatively, set `load_mel_from_disk=True` in `hparams.py` and update mel-spectrogram paths
 5. Install [pytorch 0.4](https://github.com/pytorch/pytorch)
-6. Install python requirements or build docker image
-    - Install python requirements: `pip install -r requirements.txt`
+6. Install python requirements or use docker container (tbd)
+    - Install python requirements: `pip install requirements.txt`
     - **OR**
-    - Build docker image: `docker build --tag tacotron2 .`
+    - Docker container `(tbd)`

 ## Training
 1. `python train.py --output_directory=outdir --log_directory=logdir`
@@ -38,13 +34,9 @@ wavenet](https://github.com/r9y9/wavenet_vocoder/)
 1. `python -m multiproc train.py --output_directory=outdir --log_directory=logdir --hparams=distributed_run=True,fp16_run=True`

 ## Inference
-When performing Mel-Spectrogram to Audio synthesis with a WaveNet model, make sure Tacotron 2 and WaveNet were trained on the same mel-spectrogram representation.
+Follow these steps to use a simple inference pipeline using griffin-lim:
 1. `jupyter notebook --ip=127.0.0.1 --port=31337`
 2. load inference.ipynb

 ## Related repos
 [nv-wavenet](https://github.com/NVIDIA/nv-wavenet/): Faster than real-time
 wavenet inference

data_utils.py
View File

@@ -1,5 +1,4 @@
 import random
-import numpy as np
 import torch
 import torch.utils.data
@@ -14,17 +13,18 @@ class TextMelLoader(torch.utils.data.Dataset):
         2) normalizes text and converts them to sequences of one-hot vectors
         3) computes mel-spectrograms from audio files.
     """
-    def __init__(self, audiopaths_and_text, hparams):
-        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
+    def __init__(self, audiopaths_and_text, hparams, shuffle=True):
+        self.audiopaths_and_text = load_filepaths_and_text(
+            audiopaths_and_text, hparams.sort_by_length)
         self.text_cleaners = hparams.text_cleaners
         self.max_wav_value = hparams.max_wav_value
         self.sampling_rate = hparams.sampling_rate
-        self.load_mel_from_disk = hparams.load_mel_from_disk
         self.stft = layers.TacotronSTFT(
             hparams.filter_length, hparams.hop_length, hparams.win_length,
             hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
             hparams.mel_fmax)
         random.seed(1234)
-        random.shuffle(self.audiopaths_and_text)
+        if shuffle:
+            random.shuffle(self.audiopaths_and_text)

     def get_mel_text_pair(self, audiopath_and_text):
@@ -35,22 +35,12 @@ class TextMelLoader(torch.utils.data.Dataset):
         return (text, mel)

     def get_mel(self, filename):
-        if not self.load_mel_from_disk:
-            audio, sampling_rate = load_wav_to_torch(filename)
-            if sampling_rate != self.stft.sampling_rate:
-                raise ValueError("{} {} SR doesn't match target {} SR".format(
-                    sampling_rate, self.stft.sampling_rate))
-            audio_norm = audio / self.max_wav_value
-            audio_norm = audio_norm.unsqueeze(0)
-            audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
-            melspec = self.stft.mel_spectrogram(audio_norm)
-            melspec = torch.squeeze(melspec, 0)
-        else:
-            melspec = torch.from_numpy(np.load(filename))
-            assert melspec.size(0) == self.stft.n_mel_channels, (
-                'Mel dimension mismatch: given {}, expected {}'.format(
-                    melspec.size(0), self.stft.n_mel_channels))
+        audio = load_wav_to_torch(filename, self.sampling_rate)
+        audio_norm = audio / self.max_wav_value
+        audio_norm = audio_norm.unsqueeze(0)
+        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
+        melspec = self.stft.mel_spectrogram(audio_norm)
+        melspec = torch.squeeze(melspec, 0)
         return melspec

     def get_text(self, text):
@@ -88,9 +78,9 @@ class TextMelCollate():
             text = batch[ids_sorted_decreasing[i]][0]
             text_padded[i, :text.size(0)] = text

-        # Right zero-pad mel-spec
+        # Right zero-pad mel-spec with extra single zero vector to mark the end
         num_mels = batch[0][1].size(0)
-        max_target_len = max([x[1].size(1) for x in batch])
+        max_target_len = max([x[1].size(1) for x in batch]) + 1
         if max_target_len % self.n_frames_per_step != 0:
             max_target_len += self.n_frames_per_step - max_target_len % self.n_frames_per_step
             assert max_target_len % self.n_frames_per_step == 0
@@ -104,7 +94,7 @@ class TextMelCollate():
         for i in range(len(ids_sorted_decreasing)):
             mel = batch[ids_sorted_decreasing[i]][1]
             mel_padded[i, :, :mel.size(1)] = mel
-            gate_padded[i, mel.size(1)-1:] = 1
+            gate_padded[i, mel.size(1):] = 1
             output_lengths[i] = mel.size(1)

         return text_padded, input_lengths, mel_padded, gate_padded, \
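
The two sides disagree on how the stop-token ("gate") target lines up with the padded mel. The left side marks the last real frame as the stop frame (`mel.size(1)-1`); the right side appends an extra all-zero frame (the `+ 1` on `max_target_len`) and starts the gate at `mel.size(1)`. A minimal sketch of the difference, with made-up sizes:

```python
import torch

# Hypothetical example: one mel with 4 real frames, padded to 6 total.
n_real_frames, max_target_len = 4, 6

gate_left = torch.zeros(max_target_len)
gate_left[n_real_frames - 1:] = 1   # left side: last real frame is the stop frame
print(gate_left)                    # tensor([0., 0., 0., 1., 1., 1.])

gate_right = torch.zeros(max_target_len)
gate_right[n_real_frames:] = 1      # right side: stop begins on the appended zero frame
print(gate_right)                   # tensor([0., 0., 0., 0., 1., 1.])
```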

demo.wav (BIN)

Binary file not shown.

distributed.py
View File

@@ -118,55 +118,3 @@ class DistributedDataParallel(Module):
         super(DistributedDataParallel, self).train(mode)
         self.module.train(mode)
 '''
-'''
-Modifies existing model to do gradient allreduce, but doesn't change class
-so you don't need "module"
-'''
-def apply_gradient_allreduce(module):
-    if not hasattr(dist, '_backend'):
-        module.warn_on_half = True
-    else:
-        module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False
-
-    for p in module.state_dict().values():
-        if not torch.is_tensor(p):
-            continue
-        dist.broadcast(p, 0)
-
-    def allreduce_params():
-        if(module.needs_reduction):
-            module.needs_reduction = False
-            buckets = {}
-            for param in module.parameters():
-                if param.requires_grad and param.grad is not None:
-                    tp = type(param.data)
-                    if tp not in buckets:
-                        buckets[tp] = []
-                    buckets[tp].append(param)
-            if module.warn_on_half:
-                if torch.cuda.HalfTensor in buckets:
-                    print("WARNING: gloo dist backend for half parameters may be extremely slow." +
-                          " It is recommended to use the NCCL backend in this case. This currently requires" +
-                          "PyTorch built from top of tree master.")
-                    module.warn_on_half = False
-            for tp in buckets:
-                bucket = buckets[tp]
-                grads = [param.grad.data for param in bucket]
-                coalesced = _flatten_dense_tensors(grads)
-                dist.all_reduce(coalesced)
-                coalesced /= dist.get_world_size()
-                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
-                    buf.copy_(synced)
-
-    for param in list(module.parameters()):
-        def allreduce_hook(*unused):
-            param._execution_engine.queue_callback(allreduce_params)
-        if param.requires_grad:
-            param.register_hook(allreduce_hook)
-
-    def set_needs_reduction(self, input, output):
-        self.needs_reduction = True
-
-    module.register_forward_hook(set_needs_reduction)
-    return module
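
The removed `apply_gradient_allreduce` avoids the `DistributedDataParallel` wrapper class: it registers a backward hook per parameter that buckets gradients by tensor type, all-reduces them, and averages by world size, so the training loop and checkpoint keys stay those of the plain module. A hedged usage sketch, assuming a process group was already initialized in each worker (e.g. via `multiproc`):

```python
import torch
from distributed import apply_gradient_allreduce  # the left-hand helper above

# Assumes torch.distributed.init_process_group(...) already ran in this worker.
model = torch.nn.Linear(10, 10).cuda()
model = apply_gradient_allreduce(model)  # still a plain module: no .module indirection

loss = model(torch.randn(4, 10).cuda()).sum()
loss.backward()   # the registered hooks all-reduce and average gradients here
```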

hparams.py
View File

@@ -10,7 +10,7 @@ def create_hparams(hparams_string=None, verbose=False):
         # Experiment Parameters        #
         ################################
         epochs=500,
-        iters_per_checkpoint=1000,
+        iters_per_checkpoint=500,
         seed=1234,
         dynamic_loss_scaling=True,
         fp16_run=False,
@@ -23,10 +23,10 @@ def create_hparams(hparams_string=None, verbose=False):
         ################################
         # Data Parameters              #
         ################################
-        load_mel_from_disk=False,
-        training_files='filelists/ljs_audio22khz_text_train_filelist.txt',
-        validation_files='filelists/ljs_audio22khz_text_val_filelist.txt',
+        training_files='filelists/ljs_audio_text_train_filelist.txt',
+        validation_files='filelists/ljs_audio_text_val_filelist.txt',
         text_cleaners=['english_cleaners'],
+        sort_by_length=False,

         ################################
         # Audio Parameters             #
@@ -38,7 +38,7 @@ def create_hparams(hparams_string=None, verbose=False):
         win_length=1024,
         n_mel_channels=80,
         mel_fmin=0.0,
-        mel_fmax=8000.0,
+        mel_fmax=None,  # if None, half the sampling rate

         ################################
         # Model Parameters             #
@@ -52,13 +52,11 @@ def create_hparams(hparams_string=None, verbose=False):
         encoder_embedding_dim=512,

         # Decoder parameters
-        n_frames_per_step=1,  # currently only 1 is supported
+        n_frames_per_step=1,
         decoder_rnn_dim=1024,
         prenet_dim=256,
         max_decoder_steps=1000,
-        gate_threshold=0.5,
-        p_attention_dropout=0.1,
-        p_decoder_dropout=0.1,
+        gate_threshold=0.6,

         # Attention parameters
         attention_rnn_dim=1024,
@@ -76,12 +74,11 @@ def create_hparams(hparams_string=None, verbose=False):
         ################################
         # Optimization Hyperparameters #
         ################################
-        use_saved_learning_rate=False,
         learning_rate=1e-3,
         weight_decay=1e-6,
-        grad_clip_thresh=1.0,
-        batch_size=64,
-        mask_padding=True  # set model's padded outputs to padded values
+        grad_clip_thresh=1,
+        batch_size=48,
+        mask_padding=False  # set model's padded outputs to padded values
     )

     if hparams_string:
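
Both sides of this file end by feeding `hparams_string` into the parser, which is what the `--hparams` flag in train.py uses, so any of the values above can be overridden per run without editing the file. A small sketch; the override values here are illustrative only:

```python
from hparams import create_hparams

# Same comma-separated syntax the README shows for --hparams=...
hparams = create_hparams("fp16_run=True,distributed_run=True,batch_size=32")
print(hparams.batch_size)   # 32, overriding the default above
```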

inference.ipynb (113 changed lines), Executable file → Normal file

File diff suppressed because one or more lines are too long

layers.py
View File

@@ -10,7 +10,7 @@ class LinearNorm(torch.nn.Module):
         super(LinearNorm, self).__init__()
         self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)

-        torch.nn.init.xavier_uniform_(
+        torch.nn.init.xavier_uniform(
             self.linear_layer.weight,
             gain=torch.nn.init.calculate_gain(w_init_gain))
@@ -31,7 +31,7 @@ class ConvNorm(torch.nn.Module):
                              padding=padding, dilation=dilation,
                              bias=bias)

-        torch.nn.init.xavier_uniform_(
+        torch.nn.init.xavier_uniform(
             self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))

     def forward(self, signal):
@@ -42,7 +42,7 @@ class ConvNorm(torch.nn.Module):
 class TacotronSTFT(torch.nn.Module):
     def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
                  n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
-                 mel_fmax=8000.0):
+                 mel_fmax=None):
         super(TacotronSTFT, self).__init__()
         self.n_mel_channels = n_mel_channels
         self.sampling_rate = sampling_rate
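
`xavier_uniform` (right) is the pre-0.4 spelling that PyTorch 0.4 deprecated in favor of the in-place `xavier_uniform_` (left); both fill the tensor in place. A standalone sketch of the left-hand initialization with the gain logic used in these layers:

```python
import torch

linear = torch.nn.Linear(256, 512)
torch.nn.init.xavier_uniform_(                   # PyTorch >= 0.4 in-place spelling
    linear.weight,
    gain=torch.nn.init.calculate_gain('tanh'))   # e.g. w_init_gain='tanh' as above
```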

View File

@@ -51,10 +51,11 @@ class DynamicLossScaler:
     # `x` is a torch.Tensor
     def _has_inf_or_nan(x):
-        cpu_sum = float(x.float().sum())
-        if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
+        inf_count = torch.sum(x.abs() == float('inf'))
+        if inf_count > 0:
             return True
-        return False
+        nan_count = torch.sum(x != x)
+        return nan_count > 0

     # `overflow` is boolean indicating whether we overflowed in gradient
     def update_scale(self, overflow):
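
The left-hand overflow check folds everything into one sum: any inf or nan poisons the sum, so a single Python-float comparison detects it with one device-to-host sync. The right-hand version runs two explicit reductions instead. Both flag bad values; a runnable comparison:

```python
import torch

def has_inf_or_nan_via_sum(x):       # left side: one reduction
    s = float(x.float().sum())
    return s == float('inf') or s == -float('inf') or s != s

def has_inf_or_nan_via_masks(x):     # right side: two reductions
    if torch.sum(x.abs() == float('inf')) > 0:
        return True
    return bool(torch.sum(x != x) > 0)

x = torch.tensor([1.0, float('nan'), 3.0])
print(has_inf_or_nan_via_sum(x), has_inf_or_nan_via_masks(x))  # True True
```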

model.py
View File

@@ -1,4 +1,3 @@
-from math import sqrt
 import torch
 from torch.autograd import Variable
 from torch import nn
@@ -57,7 +56,7 @@ class Attention(nn.Module):
         processed_query = self.query_layer(query.unsqueeze(1))
         processed_attention_weights = self.location_layer(attention_weights_cat)
-        energies = self.v(torch.tanh(
+        energies = self.v(F.tanh(
             processed_query + processed_attention_weights + processed_memory))

         energies = energies.squeeze(-1)
@@ -108,6 +107,7 @@ class Postnet(nn.Module):
     def __init__(self, hparams):
         super(Postnet, self).__init__()
+        self.dropout = nn.Dropout(0.5)
         self.convolutions = nn.ModuleList()

         self.convolutions.append(
@@ -141,8 +141,9 @@ class Postnet(nn.Module):
     def forward(self, x):
         for i in range(len(self.convolutions) - 1):
-            x = F.dropout(torch.tanh(self.convolutions[i](x)), 0.5, self.training)
-        x = F.dropout(self.convolutions[-1](x), 0.5, self.training)
+            x = self.dropout(F.tanh(self.convolutions[i](x)))
+        x = self.dropout(self.convolutions[-1](x))

         return x
@@ -154,6 +155,7 @@ class Encoder(nn.Module):
     """
     def __init__(self, hparams):
         super(Encoder, self).__init__()
+        self.dropout = nn.Dropout(0.5)

         convolutions = []
         for _ in range(hparams.encoder_n_convolutions):
@@ -173,7 +175,7 @@ class Encoder(nn.Module):
     def forward(self, x, input_lengths):
         for conv in self.convolutions:
-            x = F.dropout(F.relu(conv(x)), 0.5, self.training)
+            x = self.dropout(F.relu(conv(x)))

         x = x.transpose(1, 2)
@@ -192,7 +194,7 @@ class Encoder(nn.Module):
     def inference(self, x):
         for conv in self.convolutions:
-            x = F.dropout(F.relu(conv(x)), 0.5, self.training)
+            x = self.dropout(F.relu(conv(x)))

         x = x.transpose(1, 2)
@@ -213,8 +215,6 @@ class Decoder(nn.Module):
         self.prenet_dim = hparams.prenet_dim
         self.max_decoder_steps = hparams.max_decoder_steps
         self.gate_threshold = hparams.gate_threshold
-        self.p_attention_dropout = hparams.p_attention_dropout
-        self.p_decoder_dropout = hparams.p_decoder_dropout

         self.prenet = Prenet(
             hparams.n_mel_channels * hparams.n_frames_per_step,
@@ -235,7 +235,7 @@ class Decoder(nn.Module):
         self.linear_projection = LinearNorm(
             hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
-            hparams.n_mel_channels * hparams.n_frames_per_step)
+            hparams.n_mel_channels*hparams.n_frames_per_step)

         self.gate_layer = LinearNorm(
             hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, 1,
@@ -350,13 +350,11 @@ class Decoder(nn.Module):
             gate_output: gate output energies
             attention_weights:
         """
+        decoder_input = self.prenet(decoder_input)
         cell_input = torch.cat((decoder_input, self.attention_context), -1)
         self.attention_hidden, self.attention_cell = self.attention_rnn(
             cell_input, (self.attention_hidden, self.attention_cell))
-        self.attention_hidden = F.dropout(
-            self.attention_hidden, self.p_attention_dropout, self.training)
-        self.attention_cell = F.dropout(
-            self.attention_cell, self.p_attention_dropout, self.training)

         attention_weights_cat = torch.cat(
             (self.attention_weights.unsqueeze(1),
@@ -370,10 +368,6 @@ class Decoder(nn.Module):
             (self.attention_hidden, self.attention_context), -1)
         self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
             decoder_input, (self.decoder_hidden, self.decoder_cell))
-        self.decoder_hidden = F.dropout(
-            self.decoder_hidden, self.p_decoder_dropout, self.training)
-        self.decoder_cell = F.dropout(
-            self.decoder_cell, self.p_decoder_dropout, self.training)

         decoder_hidden_attention_context = torch.cat(
             (self.decoder_hidden, self.attention_context), dim=1)
@@ -398,23 +392,23 @@ class Decoder(nn.Module):
             alignments: sequence of attention weights from the decoder
         """
-        decoder_input = self.get_go_frame(memory).unsqueeze(0)
+        decoder_input = self.get_go_frame(memory)
         decoder_inputs = self.parse_decoder_inputs(decoder_inputs)
-        decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0)
-        decoder_inputs = self.prenet(decoder_inputs)

         self.initialize_decoder_states(
             memory, mask=~get_mask_from_lengths(memory_lengths))

         mel_outputs, gate_outputs, alignments = [], [], []
-        while len(mel_outputs) < decoder_inputs.size(0) - 1:
-            decoder_input = decoder_inputs[len(mel_outputs)]
+        while len(mel_outputs) < decoder_inputs.size(0):
             mel_output, gate_output, attention_weights = self.decode(
                 decoder_input)
             mel_outputs += [mel_output.squeeze(1)]
             gate_outputs += [gate_output.squeeze()]
             alignments += [attention_weights]
+            decoder_input = decoder_inputs[len(mel_outputs) - 1]

         mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
             mel_outputs, gate_outputs, alignments)
@@ -437,15 +431,15 @@ class Decoder(nn.Module):
         self.initialize_decoder_states(memory, mask=None)

         mel_outputs, gate_outputs, alignments = [], [], []
         while True:
-            decoder_input = self.prenet(decoder_input)
             mel_output, gate_output, alignment = self.decode(decoder_input)

             mel_outputs += [mel_output.squeeze(1)]
-            gate_outputs += [gate_output]
+            gate_outputs += [gate_output.squeeze()]
             alignments += [alignment]

-            if torch.sigmoid(gate_output.data) > self.gate_threshold:
+            if F.sigmoid(gate_output.data) > self.gate_threshold:
                 break
             elif len(mel_outputs) == self.max_decoder_steps:
                 print("Warning! Reached max decoder steps")
@@ -468,9 +462,6 @@ class Tacotron2(nn.Module):
         self.n_frames_per_step = hparams.n_frames_per_step
         self.embedding = nn.Embedding(
             hparams.n_symbols, hparams.symbols_embedding_dim)
-        std = sqrt(2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim))
-        val = sqrt(3.0) * std  # uniform bounds for std
-        self.embedding.weight.data.uniform_(-val, val)
         self.encoder = Encoder(hparams)
         self.decoder = Decoder(hparams)
         self.postnet = Postnet(hparams)
@@ -480,7 +471,7 @@ class Tacotron2(nn.Module):
             output_lengths = batch
         text_padded = to_gpu(text_padded).long()
         input_lengths = to_gpu(input_lengths).long()
-        max_len = torch.max(input_lengths.data).item()
+        max_len = torch.max(input_lengths.data)
         mel_padded = to_gpu(mel_padded).float()
         gate_padded = to_gpu(gate_padded).float()
         output_lengths = to_gpu(output_lengths).long()
@@ -495,7 +486,7 @@ class Tacotron2(nn.Module):
     def parse_output(self, outputs, output_lengths=None):
         if self.mask_padding and output_lengths is not None:
-            mask = ~get_mask_from_lengths(output_lengths)
+            mask = ~get_mask_from_lengths(output_lengths+1)  # +1 <stop> token
             mask = mask.expand(self.n_mel_channels, mask.size(0), mask.size(1))
             mask = mask.permute(1, 0, 2)
@@ -504,6 +495,7 @@ class Tacotron2(nn.Module):
             outputs[2].data.masked_fill_(mask[:, 0, :], 1e3)  # gate energies

         outputs = fp16_to_fp32(outputs) if self.fp16_run else outputs
+
         return outputs

     def forward(self, inputs):
@@ -521,6 +513,14 @@ class Tacotron2(nn.Module):
         mel_outputs_postnet = self.postnet(mel_outputs)
         mel_outputs_postnet = mel_outputs + mel_outputs_postnet

+        # DataParallel expects equal sized inputs/outputs, hence padding
+        if input_lengths is not None:
+            alignments = alignments.unsqueeze(0)
+            alignments = nn.functional.pad(
+                alignments,
+                (0, max_len - alignments.size(3), 0, 0),
+                "constant", 0)
+            alignments = alignments.squeeze()
         return self.parse_output(
             [mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
             output_lengths)
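
The biggest behavioral difference above is where the prenet runs: on the left it is applied once to the whole teacher-forced input matrix before the loop (with the go frame prepended), while on the right it runs inside `decode()` every step. A schematic of the left-hand loop's indexing, with toy shapes standing in for the real model:

```python
import torch

# Toy shapes only: T teacher frames, batch B, n_mel channels.
T, B, n_mel = 5, 2, 80
go_frame = torch.zeros(1, B, n_mel)               # get_go_frame(...).unsqueeze(0)
teacher = torch.randn(T, B, n_mel)                # parse_decoder_inputs(...)
decoder_inputs = torch.cat((go_frame, teacher), dim=0)
# the prenet would run once here, over all T+1 steps at once

outputs = []
while len(outputs) < decoder_inputs.size(0) - 1:  # exactly T decode steps
    decoder_input = decoder_inputs[len(outputs)]  # frame t-1 conditions step t
    outputs.append(decoder_input.mean())          # stand-in for self.decode(...)
print(len(outputs))  # 5
```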

requirements.txt
View File

@@ -1,10 +1,9 @@
-torch==0.4.0
+torch==0.2.0.post3
 matplotlib==2.1.0
-tensorflow
+tensorflow==1.5.0
 numpy==1.13.3
 inflect==0.2.5
 librosa==0.6.0
 scipy==1.0.0
 tensorboardX==1.1
 Unidecode==1.0.22
-pillow

stft.py
View File

@@ -61,7 +61,7 @@ class STFT(torch.nn.Module):
                 np.linalg.pinv(scale * fourier_basis).T[:, None, :])

         if window is not None:
-            assert(filter_length >= win_length)
+            assert(win_length >= filter_length)
             # get window and zero center pad it to filter_length
             fft_window = get_window(window, win_length, fftbins=True)
             fft_window = pad_center(fft_window, filter_length)
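
The direction of this assert matters: `pad_center` below pads the window up to `filter_length`, which is only possible when `filter_length >= win_length`, so the left-hand condition is the correct one. A quick check, using librosa 0.6.0 as pinned in requirements.txt:

```python
from scipy.signal import get_window
from librosa.util import pad_center

filter_length, win_length = 1024, 800
fft_window = get_window('hann', win_length, fftbins=True)
fft_window = pad_center(fft_window, filter_length)  # needs filter_length >= win_length
assert fft_window.shape[0] == filter_length
```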

text/__init__.py
View File

@@ -37,6 +37,8 @@ def text_to_sequence(text, cleaner_names):
         sequence += _arpabet_to_sequence(m.group(2))
         text = m.group(3)

+    # Append EOS token
+    sequence.append(_symbol_to_id['~'])
     return sequence
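
With the right-hand change, every encoded utterance ends in the id of `'~'`, the EOS symbol introduced in text/symbols.py below. A hedged check, assuming the right-hand version of the repo's `text` package is importable:

```python
from text import text_to_sequence
from text.symbols import symbols

seq = text_to_sequence("Hello, world.", ['english_cleaners'])
assert seq[-1] == symbols.index('~')   # EOS id appended as the last element
```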

text/symbols.py
View File

@@ -7,12 +7,11 @@ The default is a set of ASCII characters that works well for English or text tha
 from text import cmudict

 _pad = '_'
-_punctuation = '!\'(),.:;? '
-_special = '-'
-_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
+_eos = '~'
+_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '

 # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
 _arpabet = ['@' + s for s in cmudict.valid_symbols]

 # Export all symbols:
-symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet
+symbols = [_pad, _eos] + list(_characters) + _arpabet

train.py
View File

@@ -2,12 +2,11 @@ import os
 import time
 import argparse
 import math
-from numpy import finfo

 import torch
-from distributed import apply_gradient_allreduce
-import torch.distributed as dist
+from distributed import DistributedDataParallel
 from torch.utils.data.distributed import DistributedSampler
+from torch.nn import DataParallel
 from torch.utils.data import DataLoader

 from fp16_optimizer import FP16_Optimizer
@@ -30,20 +29,19 @@ def batchnorm_to_float(module):
 def reduce_tensor(tensor, num_gpus):
     rt = tensor.clone()
-    dist.all_reduce(rt, op=dist.reduce_op.SUM)
+    torch.distributed.all_reduce(rt, op=torch.distributed.reduce_op.SUM)
     rt /= num_gpus
     return rt


 def init_distributed(hparams, n_gpus, rank, group_name):
     assert torch.cuda.is_available(), "Distributed mode requires CUDA."
-    print("Initializing Distributed")
+    print("Initializing distributed")

     # Set cuda device so everything is done on the right GPU.
     torch.cuda.set_device(rank % torch.cuda.device_count())

     # Initialize distributed communication
-    dist.init_process_group(
+    torch.distributed.init_process_group(
         backend=hparams.dist_backend, init_method=hparams.dist_url,
         world_size=n_gpus, rank=rank, group_name=group_name)
@@ -79,9 +77,7 @@ def prepare_directories_and_logger(output_directory, log_directory, rank):
 def load_model(hparams):
     model = Tacotron2(hparams).cuda()
-    if hparams.fp16_run:
-        model = batchnorm_to_float(model.half())
-        model.decoder.attention_layer.score_mask_value = float(finfo('float16').min)
+    model = batchnorm_to_float(model.half()) if hparams.fp16_run else model

     if hparams.distributed_run:
         model = DistributedDataParallel(model)
@@ -132,20 +128,22 @@ def validate(model, criterion, valset, iteration, batch_size, n_gpus,
                             pin_memory=False, collate_fn=collate_fn)

     val_loss = 0.0
+    if distributed_run or torch.cuda.device_count() > 1:
+        batch_parser = model.module.parse_batch
+    else:
+        batch_parser = model.parse_batch
     for i, batch in enumerate(val_loader):
-        x, y = model.parse_batch(batch)
+        x, y = batch_parser(batch)
         y_pred = model(x)
         loss = criterion(y_pred, y)
-        if distributed_run:
-            reduced_val_loss = reduce_tensor(loss.data, num_gpus).item()
-        else:
-            reduced_val_loss = loss.item()
+        reduced_val_loss = reduce_tensor(loss.data, n_gpus)[0] \
+            if distributed_run else loss.data[0]
         val_loss += reduced_val_loss
     val_loss = val_loss / (i + 1)

     model.train()
-    print("Validation loss {}: {:9f}  ".format(iteration, reduced_val_loss))
-    logger.log_validation(reduced_val_loss, model, y, y_pred, iteration)
+    return val_loss


 def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
@@ -175,9 +173,6 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
         optimizer = FP16_Optimizer(
             optimizer, dynamic_loss_scale=hparams.dynamic_loss_scaling)

-    if hparams.distributed_run:
-        model = apply_gradient_allreduce(model)

     criterion = Tacotron2Loss()

     logger = prepare_directories_and_logger(
@@ -192,14 +187,16 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
         if warm_start:
             model = warm_start_model(checkpoint_path, model)
         else:
-            model, optimizer, _learning_rate, iteration = load_checkpoint(
+            model, optimizer, learning_rate, iteration = load_checkpoint(
                 checkpoint_path, model, optimizer)
-            if hparams.use_saved_learning_rate:
-                learning_rate = _learning_rate
             iteration += 1  # next iteration is iteration + 1
             epoch_offset = max(0, int(iteration / len(train_loader)))

     model.train()
+    if hparams.distributed_run or torch.cuda.device_count() > 1:
+        batch_parser = model.module.parse_batch
+    else:
+        batch_parser = model.parse_batch
     # ================ MAIN TRAINNIG LOOP! ===================
     for epoch in range(epoch_offset, hparams.epochs):
         print("Epoch: {}".format(epoch))
@@ -209,21 +206,18 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
                 param_group['lr'] = learning_rate

             model.zero_grad()
-            x, y = model.parse_batch(batch)
+            x, y = batch_parser(batch)
             y_pred = model(x)
             loss = criterion(y_pred, y)
-            if hparams.distributed_run:
-                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
-            else:
-                reduced_loss = loss.item()
+            reduced_loss = reduce_tensor(loss.data, n_gpus)[0] \
+                if hparams.distributed_run else loss.data[0]

             if hparams.fp16_run:
                 optimizer.backward(loss)
                 grad_norm = optimizer.clip_fp32_grads(hparams.grad_clip_thresh)
             else:
                 loss.backward()
-                grad_norm = torch.nn.utils.clip_grad_norm_(
+                grad_norm = torch.nn.utils.clip_grad_norm(
                     model.parameters(), hparams.grad_clip_thresh)

             optimizer.step()
@@ -234,14 +228,20 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
                 duration = time.perf_counter() - start
                 print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                     iteration, reduced_loss, grad_norm, duration))

                 logger.log_training(
                     reduced_loss, grad_norm, learning_rate, duration, iteration)

             if not overflow and (iteration % hparams.iters_per_checkpoint == 0):
-                validate(model, criterion, valset, iteration, hparams.batch_size,
-                         n_gpus, collate_fn, logger, hparams.distributed_run, rank)
+                reduced_val_loss = validate(
+                    model, criterion, valset, iteration, hparams.batch_size,
+                    n_gpus, collate_fn, logger, hparams.distributed_run, rank)
                 if rank == 0:
+                    print("Validation loss {}: {:9f}  ".format(
+                        iteration, reduced_val_loss))
+                    logger.log_validation(
+                        reduced_val_loss, model, y, y_pred, iteration)
                     checkpoint_path = os.path.join(
                         output_directory, "checkpoint_{}".format(iteration))
                     save_checkpoint(model, optimizer, learning_rate, iteration,
@@ -276,7 +276,7 @@ if __name__ == '__main__':
     torch.backends.cudnn.benchmark = hparams.cudnn_benchmark

     print("FP16 Run:", hparams.fp16_run)
-    print("Dynamic Loss Scaling:", hparams.dynamic_loss_scaling)
+    print("Dynamic Loss Scaling", hparams.dynamic_loss_scaling)
     print("Distributed Run:", hparams.distributed_run)
     print("cuDNN Enabled:", hparams.cudnn_enabled)
     print("cuDNN Benchmark:", hparams.cudnn_benchmark)

utils.py
View File

@@ -4,26 +4,29 @@ import torch


 def get_mask_from_lengths(lengths):
-    max_len = torch.max(lengths).item()
-    ids = torch.arange(0, max_len, out=torch.cuda.LongTensor(max_len))
+    max_len = torch.max(lengths)
+    ids = torch.arange(0, max_len, out=torch.LongTensor(max_len)).cuda()
     mask = (ids < lengths.unsqueeze(1)).byte()
     return mask


-def load_wav_to_torch(full_path):
+def load_wav_to_torch(full_path, sr):
     sampling_rate, data = read(full_path)
-    return torch.FloatTensor(data.astype(np.float32)), sampling_rate
+    assert sr == sampling_rate, "{} SR doesn't match {} on path {}".format(
+        sr, sampling_rate, full_path)
+    return torch.FloatTensor(data.astype(np.float32))


-def load_filepaths_and_text(filename, split="|"):
+def load_filepaths_and_text(filename, sort_by_length, split="|"):
     with open(filename, encoding='utf-8') as f:
         filepaths_and_text = [line.strip().split(split) for line in f]
+
+    if sort_by_length:
+        filepaths_and_text.sort(key=lambda x: len(x[1]))

     return filepaths_and_text


 def to_gpu(x):
-    x = x.contiguous()
-    if torch.cuda.is_available():
-        x = x.cuda(non_blocking=True)
+    x = x.contiguous().cuda(async=True)
     return torch.autograd.Variable(x)
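
The left-hand `to_gpu` guards the transfer with `torch.cuda.is_available()` and uses `non_blocking=True`; the right-hand `async=True` spelling stops parsing entirely on Python 3.7+, where `async` is a reserved keyword. The left variant, runnable with or without a GPU:

```python
import torch

def to_gpu(x):
    x = x.contiguous()
    if torch.cuda.is_available():
        x = x.cuda(non_blocking=True)  # replaces the reserved-word async=True
    return torch.autograd.Variable(x)

print(to_gpu(torch.ones(2)).shape)  # torch.Size([2])
```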

Submodule waveglow deleted from 4b1001fa33