1
0
mirror of https://github.com/malarinv/tacotron2 synced 2026-03-08 09:42:34 +00:00

26 Commits

Author SHA1 Message Date
Rafael Valle
d5b64729d1 model.py: moving for better readability 2018-05-20 12:22:06 -07:00
Rafael Valle
977cb37cea model.py: attending to full mel instead of prenet and dropout mel 2018-05-18 06:59:09 -07:00
Rafael Valle
da30fd8709 Merge pull request #20 from NVIDIA/fp16_path
Fp16 patch, not path!
2018-05-15 09:55:19 -07:00
Rafael Valle
27b1767cb2 train.py: fixing typo 2018-05-15 09:53:33 -07:00
Rafael Valle
817cd403d4 Merge branch 'master' of https://github.com/NVIDIA/tacotron2 into load_mel_from_disk 2018-05-15 09:51:41 -07:00
Rafael Valle
1071023017 train.py: patching score_mask_value formerly inf, not concrete value, for compatibility with pytorch 2018-05-15 09:50:56 -07:00
Rafael Valle
cd851585cb loss_scaler.py: patching loss scaler for compatibility with current pytorch 2018-05-15 09:50:08 -07:00
Rafael Valle
bd42cb6ed7 Merge pull request #19 from NVIDIA/load_mel_from_disk
Load mel from disk
2018-05-15 08:54:24 -07:00
Rafael Valle
2da7a2ebab README.md: describing how to load mel from disk 2018-05-15 08:50:21 -07:00
Rafael Valle
62d2c8b957 data_utils.py: adding support for loading mel from disk 2018-05-15 08:42:06 -07:00
Rafael Valle
2d41ea0682 hparams.py: adding load_mel_from_disk params 2018-05-15 08:41:03 -07:00
Rafael Valle
20568643cb Merge branch 'master' of https://github.com/NVIDIA/tacotron2 2018-05-06 08:58:07 -07:00
Rafael Valle
dcd925f6c8 model.py: mixed squeeze target. fixing 2018-05-06 08:58:01 -07:00
Raul Puri
4ac6ce9ab5 ipynb typo 2018-05-05 17:30:08 -07:00
Raul Puri
c67ca6531e force single gpu in inference.ipynb 2018-05-05 17:29:09 -07:00
Raul Puri
78d5150d83 inference (distributed) dataparallel patch
removing the '.module' that comes from (distributed)dataparallel state dict
2018-05-05 17:23:11 -07:00
Rafael Valle
424b2f5bf0 README.md: fixing typo as pointed out by @syoyo 2018-05-04 23:06:58 -07:00
Rafael Valle
a38429e629 Merge pull request #6 from NVIDIA/padding-patch-0.4
integer maxlen for padding
2018-05-04 13:24:57 -07:00
Raul Puri
b20765a3dc 0.4 scalar tensor padding update 2018-05-04 12:12:08 -07:00
Raul Puri
2a394f4aaa integer maxlen for padding 2018-05-04 11:11:14 -07:00
Rafael Valle
2c545ac800 Merge pull request #4 from NVIDIA/mask-utils-0.4
mask utils update for 0.4 cuda
2018-05-04 11:02:30 -07:00
Raul Puri
6fbba8ef0f mask utils update for 0.4 cuda 2018-05-04 10:14:30 -07:00
Rafael Valle
c141726a96 requirements.txt: updating tensorflow requirements 2018-05-04 09:44:14 -07:00
Rafael Valle
535042a584 README.md: updating readme to include docker setup 2018-05-04 09:42:11 -07:00
Rafael Valle
a72160b8cb Dockerfile: adding dockerfile 2018-05-04 09:39:34 -07:00
Rafael Valle
d750fcf395 Merge pull request #2 from NVIDIA/single-gpu-and-0.4
train.py single gpu and 0.4 update
2018-05-04 09:12:13 -07:00
10 changed files with 47 additions and 33 deletions

View File

@@ -1,3 +1,4 @@
FROM pytorch/pytorch:0.4_cuda9_cudnn7 FROM pytorch/pytorch:0.4_cuda9_cudnn7
RUN pip install numpy scipy matplotlib librosa==0.6.0 tensorflow tensorboardX inflect==0.2.5 Unidecode==1.0.22 RUN pip install numpy scipy matplotlib librosa==0.6.0 tensorflow tensorboardX
inflect==0.2.5 Unidecode==1.0.22

View File

@@ -20,11 +20,12 @@ Distributed and FP16 support relies on work by Christian Sarofeen and NVIDIA's
2. Clone this repo: `git clone https://github.com/NVIDIA/tacotron2.git` 2. Clone this repo: `git clone https://github.com/NVIDIA/tacotron2.git`
3. CD into this repo: `cd tacotron2` 3. CD into this repo: `cd tacotron2`
4. Update .wav paths: `sed -i -- 's,DUMMY,ljs_dataset_folder/wavs,g' filelists/*.txt` 4. Update .wav paths: `sed -i -- 's,DUMMY,ljs_dataset_folder/wavs,g' filelists/*.txt`
- Alternatively, set `load_mel_from_disk=True` in `hparams.py` and update mel-spectrogram paths
5. Install [pytorch 0.4](https://github.com/pytorch/pytorch) 5. Install [pytorch 0.4](https://github.com/pytorch/pytorch)
6. Install python requirements or use docker container (tbd) 6. Install python requirements or build docker image
- Install python requirements: `pip install requirements.txt` - Install python requirements: `pip install -r requirements.txt`
- **OR** - **OR**
- Docker container `(tbd)` - Build docker image: `docker build --tag tacotron2 .`
## Training ## Training
1. `python train.py --output_directory=outdir --log_directory=logdir` 1. `python train.py --output_directory=outdir --log_directory=logdir`

View File

@@ -1,4 +1,5 @@
import random import random
import numpy as np
import torch import torch
import torch.utils.data import torch.utils.data
@@ -19,6 +20,7 @@ class TextMelLoader(torch.utils.data.Dataset):
self.text_cleaners = hparams.text_cleaners self.text_cleaners = hparams.text_cleaners
self.max_wav_value = hparams.max_wav_value self.max_wav_value = hparams.max_wav_value
self.sampling_rate = hparams.sampling_rate self.sampling_rate = hparams.sampling_rate
self.load_mel_from_disk = hparams.load_mel_from_disk
self.stft = layers.TacotronSTFT( self.stft = layers.TacotronSTFT(
hparams.filter_length, hparams.hop_length, hparams.win_length, hparams.filter_length, hparams.hop_length, hparams.win_length,
hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin, hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
@@ -35,12 +37,19 @@ class TextMelLoader(torch.utils.data.Dataset):
return (text, mel) return (text, mel)
def get_mel(self, filename): def get_mel(self, filename):
audio = load_wav_to_torch(filename, self.sampling_rate) if not self.load_mel_from_disk:
audio_norm = audio / self.max_wav_value audio = load_wav_to_torch(filename, self.sampling_rate)
audio_norm = audio_norm.unsqueeze(0) audio_norm = audio / self.max_wav_value
audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False) audio_norm = audio_norm.unsqueeze(0)
melspec = self.stft.mel_spectrogram(audio_norm) audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
melspec = torch.squeeze(melspec, 0) melspec = self.stft.mel_spectrogram(audio_norm)
melspec = torch.squeeze(melspec, 0)
else:
melspec = torch.from_numpy(np.load(filename))
assert melspec.size(0) == self.stft.n_mel_channels, (
'Mel dimension mismatch: given {}, expected {}'.format(
melspec.size(0), self.stft.n_mel_channels))
return melspec return melspec
def get_text(self, text): def get_text(self, text):

View File

@@ -23,6 +23,7 @@ def create_hparams(hparams_string=None, verbose=False):
################################ ################################
# Data Parameters # # Data Parameters #
################################ ################################
load_mel_from_disk=False,
training_files='filelists/ljs_audio_text_train_filelist.txt', training_files='filelists/ljs_audio_text_train_filelist.txt',
validation_files='filelists/ljs_audio_text_val_filelist.txt', validation_files='filelists/ljs_audio_text_val_filelist.txt',
text_cleaners=['english_cleaners'], text_cleaners=['english_cleaners'],

View File

@@ -98,8 +98,11 @@
"source": [ "source": [
"checkpoint_path = \"/home/scratch.adlr-gcf/audio_denoising/runs/TTS-Tacotron2-LJS-MSE-DRC-NoMaskPadding-Unsorted-Distributed-22khz/checkpoint_15500\"\n", "checkpoint_path = \"/home/scratch.adlr-gcf/audio_denoising/runs/TTS-Tacotron2-LJS-MSE-DRC-NoMaskPadding-Unsorted-Distributed-22khz/checkpoint_15500\"\n",
"model = load_model(hparams)\n", "model = load_model(hparams)\n",
"model.load_state_dict(torch.load(checkpoint_path)['state_dict'])\n", "try:\n",
"model = model.module\n", " model = model.module\n",
"except:\n",
" pass\n",
"model.load_state_dict({k.replace('module.',''):v for k,v in torch.load(checkpoint_path)['state_dict'].items()})\n",
"_ = model.eval()" "_ = model.eval()"
] ]
}, },

View File

@@ -51,11 +51,10 @@ class DynamicLossScaler:
# `x` is a torch.Tensor # `x` is a torch.Tensor
def _has_inf_or_nan(x): def _has_inf_or_nan(x):
inf_count = torch.sum(x.abs() == float('inf')) cpu_sum = float(x.float().sum())
if inf_count > 0: if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
return True return True
nan_count = torch.sum(x != x) return False
return nan_count > 0
# `overflow` is boolean indicating whether we overflowed in gradient # `overflow` is boolean indicating whether we overflowed in gradient
def update_scale(self, overflow): def update_scale(self, overflow):

View File

@@ -221,7 +221,7 @@ class Decoder(nn.Module):
[hparams.prenet_dim, hparams.prenet_dim]) [hparams.prenet_dim, hparams.prenet_dim])
self.attention_rnn = nn.LSTMCell( self.attention_rnn = nn.LSTMCell(
hparams.prenet_dim + hparams.encoder_embedding_dim, hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
hparams.attention_rnn_dim) hparams.attention_rnn_dim)
self.attention_layer = Attention( self.attention_layer = Attention(
@@ -230,7 +230,7 @@ class Decoder(nn.Module):
hparams.attention_location_kernel_size) hparams.attention_location_kernel_size)
self.decoder_rnn = nn.LSTMCell( self.decoder_rnn = nn.LSTMCell(
hparams.attention_rnn_dim + hparams.encoder_embedding_dim, hparams.prenet_dim + hparams.encoder_embedding_dim,
hparams.decoder_rnn_dim, 1) hparams.decoder_rnn_dim, 1)
self.linear_projection = LinearNorm( self.linear_projection = LinearNorm(
@@ -351,8 +351,7 @@ class Decoder(nn.Module):
attention_weights: attention_weights:
""" """
decoder_input = self.prenet(decoder_input) cell_input = torch.cat((self.decoder_hidden, self.attention_context), -1)
cell_input = torch.cat((decoder_input, self.attention_context), -1)
self.attention_hidden, self.attention_cell = self.attention_rnn( self.attention_hidden, self.attention_cell = self.attention_rnn(
cell_input, (self.attention_hidden, self.attention_cell)) cell_input, (self.attention_hidden, self.attention_cell))
@@ -364,8 +363,8 @@ class Decoder(nn.Module):
attention_weights_cat, self.mask) attention_weights_cat, self.mask)
self.attention_weights_cum += self.attention_weights self.attention_weights_cum += self.attention_weights
decoder_input = torch.cat( prenet_output = self.prenet(decoder_input)
(self.attention_hidden, self.attention_context), -1) decoder_input = torch.cat((prenet_output, self.attention_context), -1)
self.decoder_hidden, self.decoder_cell = self.decoder_rnn( self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
decoder_input, (self.decoder_hidden, self.decoder_cell)) decoder_input, (self.decoder_hidden, self.decoder_cell))
@@ -402,9 +401,8 @@ class Decoder(nn.Module):
while len(mel_outputs) < decoder_inputs.size(0): while len(mel_outputs) < decoder_inputs.size(0):
mel_output, gate_output, attention_weights = self.decode( mel_output, gate_output, attention_weights = self.decode(
decoder_input) decoder_input)
mel_outputs += [mel_output]
mel_outputs += [mel_output.squeeze(1)] gate_outputs += [gate_output.squeeze(1)]
gate_outputs += [gate_output.squeeze()]
alignments += [attention_weights] alignments += [attention_weights]
decoder_input = decoder_inputs[len(mel_outputs) - 1] decoder_input = decoder_inputs[len(mel_outputs) - 1]
@@ -431,12 +429,11 @@ class Decoder(nn.Module):
self.initialize_decoder_states(memory, mask=None) self.initialize_decoder_states(memory, mask=None)
mel_outputs, gate_outputs, alignments = [], [], [] mel_outputs, gate_outputs, alignments = [], [], []
while True: while True:
mel_output, gate_output, alignment = self.decode(decoder_input) mel_output, gate_output, alignment = self.decode(decoder_input)
mel_outputs += [mel_output.squeeze(1)] mel_outputs += [mel_output]
gate_outputs += [gate_output.squeeze()] gate_outputs += [gate_output.squeeze(1)]
alignments += [alignment] alignments += [alignment]
if F.sigmoid(gate_output.data) > self.gate_threshold: if F.sigmoid(gate_output.data) > self.gate_threshold:
@@ -470,8 +467,8 @@ class Tacotron2(nn.Module):
text_padded, input_lengths, mel_padded, gate_padded, \ text_padded, input_lengths, mel_padded, gate_padded, \
output_lengths = batch output_lengths = batch
text_padded = to_gpu(text_padded).long() text_padded = to_gpu(text_padded).long()
max_len = int(torch.max(input_lengths.data).numpy())
input_lengths = to_gpu(input_lengths).long() input_lengths = to_gpu(input_lengths).long()
max_len = torch.max(input_lengths.data)
mel_padded = to_gpu(mel_padded).float() mel_padded = to_gpu(mel_padded).float()
gate_padded = to_gpu(gate_padded).float() gate_padded = to_gpu(gate_padded).float()
output_lengths = to_gpu(output_lengths).long() output_lengths = to_gpu(output_lengths).long()

View File

@@ -1,6 +1,6 @@
torch==0.2.0.post3 torch==0.2.0.post3
matplotlib==2.1.0 matplotlib==2.1.0
tensorflow==1.5.0 tensorflow
numpy==1.13.3 numpy==1.13.3
inflect==0.2.5 inflect==0.2.5
librosa==0.6.0 librosa==0.6.0

View File

@@ -2,6 +2,7 @@ import os
import time import time
import argparse import argparse
import math import math
from numpy import finfo
import torch import torch
from distributed import DistributedDataParallel from distributed import DistributedDataParallel
@@ -77,7 +78,9 @@ def prepare_directories_and_logger(output_directory, log_directory, rank):
def load_model(hparams): def load_model(hparams):
model = Tacotron2(hparams).cuda() model = Tacotron2(hparams).cuda()
model = batchnorm_to_float(model.half()) if hparams.fp16_run else model if hparams.fp16_run:
model = batchnorm_to_float(model.half())
model.decoder.attention_layer.score_mask_value = float(finfo('float16').min)
if hparams.distributed_run: if hparams.distributed_run:
model = DistributedDataParallel(model) model = DistributedDataParallel(model)
@@ -276,7 +279,7 @@ if __name__ == '__main__':
torch.backends.cudnn.benchmark = hparams.cudnn_benchmark torch.backends.cudnn.benchmark = hparams.cudnn_benchmark
print("FP16 Run:", hparams.fp16_run) print("FP16 Run:", hparams.fp16_run)
print("Dynamic Loss Scaling", hparams.dynamic_loss_scaling) print("Dynamic Loss Scaling:", hparams.dynamic_loss_scaling)
print("Distributed Run:", hparams.distributed_run) print("Distributed Run:", hparams.distributed_run)
print("cuDNN Enabled:", hparams.cudnn_enabled) print("cuDNN Enabled:", hparams.cudnn_enabled)
print("cuDNN Benchmark:", hparams.cudnn_benchmark) print("cuDNN Benchmark:", hparams.cudnn_benchmark)

View File

@@ -5,7 +5,7 @@ import torch
def get_mask_from_lengths(lengths): def get_mask_from_lengths(lengths):
max_len = torch.max(lengths) max_len = torch.max(lengths)
ids = torch.arange(0, max_len, out=torch.LongTensor(max_len)).cuda() ids = torch.arange(0, max_len).long().cuda()
mask = (ids < lengths.unsqueeze(1)).byte() mask = (ids < lengths.unsqueeze(1)).byte()
return mask return mask