train.py: patching score_mask_value, formerly inf rather than a concrete value, for compatibility with pytorch

loss_scaler.py: patching loss scaler for compatibility with current pytorch
README.md: describing how to load mel from disk
2026-06-13 10:52:06 +00:00 · 2018-05-15 09:50:56 -07:00 · 2018-05-15 09:50:08 -07:00 · 2018-05-15 08:50:21 -07:00 · 2018-05-15 08:42:06 -07:00 · 2018-05-15 08:41:03 -07:00
7 changed files with 35 additions and 21 deletions
--- a/README.md
+++ b/README.md
@@ -20,9 +20,10 @@ Distributed and FP16 support relies on work by Christian Sarofeen and NVIDIA's
 2. Clone this repo: `git clone https://github.com/NVIDIA/tacotron2.git`
 3. CD into this repo: `cd tacotron2`
 4. Update .wav paths: `sed -i -- 's,DUMMY,ljs_dataset_folder/wavs,g' filelists/*.txt`
+    - Alternatively, set `load_mel_from_disk=True` in `hparams.py` and update mel-spectrogram paths 
 5. Install [pytorch 0.4](https://github.com/pytorch/pytorch)
 6. Install python requirements or build docker image 
-    - Install python requirements: `pip install requirements.txt`
+    - Install python requirements: `pip install -r requirements.txt`
    - **OR**
    - Build docker image: `docker build --tag tacotron2 .` 

--- a/data_utils.py
+++ b/data_utils.py
@@ -1,4 +1,5 @@
 import random
+import numpy as np
 import torch
 import torch.utils.data

@@ -19,6 +20,7 @@ class TextMelLoader(torch.utils.data.Dataset):
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
+        self.load_mel_from_disk = hparams.load_mel_from_disk
        self.stft = layers.TacotronSTFT(
            hparams.filter_length, hparams.hop_length, hparams.win_length,
            hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
@@ -35,12 +37,19 @@ class TextMelLoader(torch.utils.data.Dataset):
        return (text, mel)

    def get_mel(self, filename):
-        audio = load_wav_to_torch(filename, self.sampling_rate)
-        audio_norm = audio / self.max_wav_value
-        audio_norm = audio_norm.unsqueeze(0)
-        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
-        melspec = self.stft.mel_spectrogram(audio_norm)
-        melspec = torch.squeeze(melspec, 0)
+        if not self.load_mel_from_disk:
+            audio = load_wav_to_torch(filename, self.sampling_rate)
+            audio_norm = audio / self.max_wav_value
+            audio_norm = audio_norm.unsqueeze(0)
+            audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
+            melspec = self.stft.mel_spectrogram(audio_norm)
+            melspec = torch.squeeze(melspec, 0)
+        else:
+            melspec = torch.from_numpy(np.load(filename))
+            assert melspec.size(0) == self.stft.n_mel_channels, (
+                'Mel dimension mismatch: given {}, expected {}'.format(
+                    melspec.size(0), self.stft.n_mel_channels))
+
        return melspec

    def get_text(self, text):
--- a/hparams.py
+++ b/hparams.py
@@ -23,6 +23,7 @@ def create_hparams(hparams_string=None, verbose=False):
        ################################
        # Data Parameters             #
        ################################
+        load_mel_from_disk=False,
        training_files='filelists/ljs_audio_text_train_filelist.txt',
        validation_files='filelists/ljs_audio_text_val_filelist.txt',
        text_cleaners=['english_cleaners'],
--- a/inference.ipynb
+++ b/inference.ipynb
@@ -98,8 +98,11 @@
   "source": [
    "checkpoint_path = \"/home/scratch.adlr-gcf/audio_denoising/runs/TTS-Tacotron2-LJS-MSE-DRC-NoMaskPadding-Unsorted-Distributed-22khz/checkpoint_15500\"\n",
    "model = load_model(hparams)\n",
-    "model.load_state_dict(torch.load(checkpoint_path)['state_dict'])\n",
-    "model = model.module\n",
+    "try:\n",
+    "    model = model.module\n",
+    "except:\n",
+    "    pass\n",
+    "model.load_state_dict({k.replace('module.',''):v for k,v in torch.load(checkpoint_path)['state_dict'].items()})\n",
    "_ = model.eval()"
   ]
  },
--- a/loss_scaler.py
+++ b/loss_scaler.py
@@ -51,11 +51,10 @@ class DynamicLossScaler:

    # `x` is a torch.Tensor
    def _has_inf_or_nan(x):
-        inf_count = torch.sum(x.abs() == float('inf'))
-        if inf_count > 0:
+        cpu_sum = float(x.float().sum())
+        if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
            return True
-        nan_count = torch.sum(x != x)
-        return nan_count > 0
+        return False

    # `overflow` is boolean indicating whether we overflowed in gradient
    def update_scale(self, overflow):
--- a/model.py
+++ b/model.py
@@ -402,9 +402,8 @@ class Decoder(nn.Module):
        while len(mel_outputs) < decoder_inputs.size(0):
            mel_output, gate_output, attention_weights = self.decode(
                decoder_input)
-
-            mel_outputs += [mel_output.squeeze(1)]
-            gate_outputs += [gate_output.squeeze()]
+            mel_outputs += [mel_output]
+            gate_outputs += [gate_output.squeeze(1)]
            alignments += [attention_weights]

            decoder_input = decoder_inputs[len(mel_outputs) - 1]
@@ -431,12 +430,11 @@ class Decoder(nn.Module):
        self.initialize_decoder_states(memory, mask=None)

        mel_outputs, gate_outputs, alignments = [], [], []
-
        while True:
            mel_output, gate_output, alignment = self.decode(decoder_input)

-            mel_outputs += [mel_output.squeeze(1)]
-            gate_outputs += [gate_output.squeeze()]
+            mel_outputs += [mel_output]
+            gate_outputs += [gate_output.squeeze(1)]
            alignments += [alignment]

            if F.sigmoid(gate_output.data) > self.gate_threshold:
@@ -470,8 +468,8 @@ class Tacotron2(nn.Module):
        text_padded, input_lengths, mel_padded, gate_padded, \
            output_lengths = batch
        text_padded = to_gpu(text_padded).long()
+        max_len = int(torch.max(input_lengths.data).numpy())
        input_lengths = to_gpu(input_lengths).long()
-        max_len = torch.max(input_lengths.data)
        mel_padded = to_gpu(mel_padded).float()
        gate_padded = to_gpu(gate_padded).float()
        output_lengths = to_gpu(output_lengths).long()
--- a/train.py
+++ b/train.py
@@ -2,6 +2,7 @@ import os
 import time
 import argparse
 import math
+from numpy import finfo

 import torch
 from distributed import DistributedDataParallel
@@ -77,7 +78,9 @@ def prepare_directories_and_logger(output_directory, log_directory, rank):

 def load_model(hparams):
    model = Tacotron2(hparams).cuda()
-    model = batchnorm_to_float(model.half()) if hparams.fp16_run else model
+    if hparams.fp16_run:
+        model = batchnorm_to_float(model.half())
+        model.decoder.attention_layer.score_mask_value = float(finfo('float16').min)

    if hparams.distributed_run:
        model = DistributedDataParallel(model)
Author	SHA1	Message	Date
Rafael Valle	1071023017	train.py: patching score_mask_value, formerly inf rather than a concrete value, for compatibility with pytorch	2018-05-15 09:50:56 -07:00
Rafael Valle	cd851585cb	loss_scaler.py: patching loss scaler for compatibility with current pytorch	2018-05-15 09:50:08 -07:00
Rafael Valle	2da7a2ebab	README.md: describing how to load mel from disk	2018-05-15 08:50:21 -07:00
Rafael Valle	62d2c8b957	data_utils.py: adding support for loading mel from disk	2018-05-15 08:42:06 -07:00
Rafael Valle	2d41ea0682	hparams.py: adding load_mel_from_disk params	2018-05-15 08:41:03 -07:00
Rafael Valle	20568643cb	Merge branch 'master' of https://github.com/NVIDIA/tacotron2	2018-05-06 08:58:07 -07:00
Rafael Valle	dcd925f6c8	model.py: mixed squeeze target. fixing	2018-05-06 08:58:01 -07:00
Raul Puri	4ac6ce9ab5	ipynb typo	2018-05-05 17:30:08 -07:00
Raul Puri	c67ca6531e	force single gpu in inference.ipynb	2018-05-05 17:29:09 -07:00
Raul Puri	78d5150d83	inference (distributed) dataparallel patch removing the '.module' that comes from (distributed) dataparallel state dict	2018-05-05 17:23:11 -07:00
Rafael Valle	424b2f5bf0	README.md: fixing typo as pointed out by @syoyo	2018-05-04 23:06:58 -07:00
Rafael Valle	a38429e629	Merge pull request #6 from NVIDIA/padding-patch-0.4 integer maxlen for padding	2018-05-04 13:24:57 -07:00
Raul Puri	b20765a3dc	0.4 scalar tensor padding update	2018-05-04 12:12:08 -07:00
Raul Puri	2a394f4aaa	integer maxlen for padding	2018-05-04 11:11:14 -07:00
Rafael Valle	2c545ac800	Merge pull request #4 from NVIDIA/mask-utils-0.4 mask utils update for 0.4 cuda	2018-05-04 11:02:30 -07:00