mirror of https://github.com/malarinv/tacotron2 synced 2026-03-09 02:02:33 +00:00

1 commit

Author SHA1 Message Date
Rafael Valle
723e869d4b Dockerfile: adding dockerfile 2018-05-04 09:39:05 -07:00
18 changed files with 178 additions and 247 deletions

.gitmodules vendored (3 changed lines)
View File

@@ -1,3 +0,0 @@
-[submodule "waveglow"]
-	path = waveglow
-	url = https://github.com/NVIDIA/waveglow

Dockerfile
View File

@@ -1,2 +1,3 @@
 FROM pytorch/pytorch:0.4_cuda9_cudnn7
-RUN pip install numpy scipy matplotlib librosa==0.6.0 tensorflow tensorboardX inflect==0.2.5 Unidecode==1.0.22 jupyter
+RUN pip install numpy scipy matplotlib librosa==0.6.0 tensorflow tensorboardX inflect==0.2.5 Unidecode==1.0.22

README.md
View File

@@ -11,9 +11,6 @@ Distributed and FP16 support relies on work by Christian Sarofeen and NVIDIA's
 ![Alignment, Predicted Mel Spectrogram, Target Mel Spectrogram](tensorboard.png)
-[Download demo audio](https://github.com/NVIDIA/tacotron2/blob/master/demo.wav) trained on LJS and using Ryuichi Yamamoto's [pre-trained Mixture of Logistics
-wavenet](https://github.com/r9y9/wavenet_vocoder/)
-"Scientists at the CERN laboratory say they have discovered a new particle."

 ## Pre-requisites
 1. NVIDIA GPU + CUDA cuDNN
@@ -23,12 +20,11 @@ wavenet](https://github.com/r9y9/wavenet_vocoder/)
 2. Clone this repo: `git clone https://github.com/NVIDIA/tacotron2.git`
 3. CD into this repo: `cd tacotron2`
 4. Update .wav paths: `sed -i -- 's,DUMMY,ljs_dataset_folder/wavs,g' filelists/*.txt`
-    - Alternatively, set `load_mel_from_disk=True` in `hparams.py` and update mel-spectrogram paths
 5. Install [pytorch 0.4](https://github.com/pytorch/pytorch)
-6. Install python requirements or build docker image
-    - Install python requirements: `pip install -r requirements.txt`
+6. Install python requirements or use docker container (tbd)
+    - Install python requirements: `pip install requirements.txt`
     - **OR**
-    - Build docker image: `docker build --tag tacotron2 .`
+    - Docker container `(tbd)`

 ## Training
 1. `python train.py --output_directory=outdir --log_directory=logdir`
@@ -38,13 +34,9 @@ wavenet](https://github.com/r9y9/wavenet_vocoder/)
 1. `python -m multiproc train.py --output_directory=outdir --log_directory=logdir --hparams=distributed_run=True,fp16_run=True`

 ## Inference
-When performing Mel-Spectrogram to Audio synthesis with a WaveNet model, make sure Tacotron 2 and WaveNet were trained on the same mel-spectrogram representation.
+Follow these steps to use a simple inference pipeline using griffin-lim:
 1. `jupyter notebook --ip=127.0.0.1 --port=31337`
 2. load inference.ipynb

 ## Related repos
 [nv-wavenet](https://github.com/NVIDIA/nv-wavenet/): Faster than real-time
 wavenet inference

data_utils.py
View File

@@ -1,5 +1,4 @@
 import random
-import numpy as np
 import torch
 import torch.utils.data
@@ -14,17 +13,18 @@ class TextMelLoader(torch.utils.data.Dataset):
         2) normalizes text and converts them to sequences of one-hot vectors
         3) computes mel-spectrograms from audio files.
     """
-    def __init__(self, audiopaths_and_text, hparams):
-        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
+    def __init__(self, audiopaths_and_text, hparams, shuffle=True):
+        self.audiopaths_and_text = load_filepaths_and_text(
+            audiopaths_and_text, hparams.sort_by_length)
         self.text_cleaners = hparams.text_cleaners
         self.max_wav_value = hparams.max_wav_value
         self.sampling_rate = hparams.sampling_rate
-        self.load_mel_from_disk = hparams.load_mel_from_disk
         self.stft = layers.TacotronSTFT(
             hparams.filter_length, hparams.hop_length, hparams.win_length,
             hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
             hparams.mel_fmax)
         random.seed(1234)
-        random.shuffle(self.audiopaths_and_text)
+        if shuffle:
+            random.shuffle(self.audiopaths_and_text)

     def get_mel_text_pair(self, audiopath_and_text):
@@ -35,22 +35,12 @@ class TextMelLoader(torch.utils.data.Dataset):
         return (text, mel)

     def get_mel(self, filename):
-        if not self.load_mel_from_disk:
-            audio, sampling_rate = load_wav_to_torch(filename)
-            if sampling_rate != self.stft.sampling_rate:
-                raise ValueError("{} {} SR doesn't match target {} SR".format(
-                    sampling_rate, self.stft.sampling_rate))
-            audio_norm = audio / self.max_wav_value
-            audio_norm = audio_norm.unsqueeze(0)
-            audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
-            melspec = self.stft.mel_spectrogram(audio_norm)
-            melspec = torch.squeeze(melspec, 0)
-        else:
-            melspec = torch.from_numpy(np.load(filename))
-            assert melspec.size(0) == self.stft.n_mel_channels, (
-                'Mel dimension mismatch: given {}, expected {}'.format(
-                    melspec.size(0), self.stft.n_mel_channels))
+        audio = load_wav_to_torch(filename, self.sampling_rate)
+        audio_norm = audio / self.max_wav_value
+        audio_norm = audio_norm.unsqueeze(0)
+        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
+        melspec = self.stft.mel_spectrogram(audio_norm)
+        melspec = torch.squeeze(melspec, 0)
         return melspec

     def get_text(self, text):
@@ -88,9 +78,9 @@ class TextMelCollate():
             text = batch[ids_sorted_decreasing[i]][0]
             text_padded[i, :text.size(0)] = text

-        # Right zero-pad mel-spec
+        # Right zero-pad mel-spec with extra single zero vector to mark the end
         num_mels = batch[0][1].size(0)
-        max_target_len = max([x[1].size(1) for x in batch])
+        max_target_len = max([x[1].size(1) for x in batch]) + 1
         if max_target_len % self.n_frames_per_step != 0:
             max_target_len += self.n_frames_per_step - max_target_len % self.n_frames_per_step
             assert max_target_len % self.n_frames_per_step == 0
@@ -104,7 +94,7 @@ class TextMelCollate():
         for i in range(len(ids_sorted_decreasing)):
             mel = batch[ids_sorted_decreasing[i]][1]
             mel_padded[i, :, :mel.size(1)] = mel
-            gate_padded[i, mel.size(1)-1:] = 1
+            gate_padded[i, mel.size(1):] = 1
             output_lengths[i] = mel.size(1)

         return text_padded, input_lengths, mel_padded, gate_padded, \
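
The two sides disagree on how the stop-token ("gate") target lines up with the padded mel. The left side marks the last real frame as the stop frame (`mel.size(1)-1`); the right side appends an extra all-zero frame (the `+ 1` on `max_target_len`) and starts the gate at `mel.size(1)`. A minimal sketch of the difference, with made-up sizes:

```python
import torch

# Hypothetical example: one mel with 4 real frames, padded to 6 total.
n_real_frames, max_target_len = 4, 6

gate_left = torch.zeros(max_target_len)
gate_left[n_real_frames - 1:] = 1   # left side: last real frame is the stop frame
print(gate_left)                    # tensor([0., 0., 0., 1., 1., 1.])

gate_right = torch.zeros(max_target_len)
gate_right[n_real_frames:] = 1      # right side: stop begins on the appended zero frame
print(gate_right)                   # tensor([0., 0., 0., 0., 1., 1.])
```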

demo.wav (BIN)

Binary file not shown.

distributed.py
View File

@@ -118,55 +118,3 @@ class DistributedDataParallel(Module):
         super(DistributedDataParallel, self).train(mode)
         self.module.train(mode)
 '''
-'''
-Modifies existing model to do gradient allreduce, but doesn't change class
-so you don't need "module"
-'''
-def apply_gradient_allreduce(module):
-    if not hasattr(dist, '_backend'):
-        module.warn_on_half = True
-    else:
-        module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False
-
-    for p in module.state_dict().values():
-        if not torch.is_tensor(p):
-            continue
-        dist.broadcast(p, 0)
-
-    def allreduce_params():
-        if(module.needs_reduction):
-            module.needs_reduction = False
-            buckets = {}
-            for param in module.parameters():
-                if param.requires_grad and param.grad is not None:
-                    tp = type(param.data)
-                    if tp not in buckets:
-                        buckets[tp] = []
-                    buckets[tp].append(param)
-            if module.warn_on_half:
-                if torch.cuda.HalfTensor in buckets:
-                    print("WARNING: gloo dist backend for half parameters may be extremely slow." +
-                          " It is recommended to use the NCCL backend in this case. This currently requires" +
-                          "PyTorch built from top of tree master.")
-                    module.warn_on_half = False
-            for tp in buckets:
-                bucket = buckets[tp]
-                grads = [param.grad.data for param in bucket]
-                coalesced = _flatten_dense_tensors(grads)
-                dist.all_reduce(coalesced)
-                coalesced /= dist.get_world_size()
-                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
-                    buf.copy_(synced)
-
-    for param in list(module.parameters()):
-        def allreduce_hook(*unused):
-            param._execution_engine.queue_callback(allreduce_params)
-        if param.requires_grad:
-            param.register_hook(allreduce_hook)
-
-    def set_needs_reduction(self, input, output):
-        self.needs_reduction = True
-
-    module.register_forward_hook(set_needs_reduction)
-    return module
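
The removed `apply_gradient_allreduce` avoids the `DistributedDataParallel` wrapper class: it registers a backward hook per parameter that buckets gradients by tensor type, all-reduces them, and averages by world size, so the training loop and checkpoint keys stay those of the plain module. A hedged usage sketch, assuming a process group was already initialized in each worker (e.g. via `multiproc`):

```python
import torch
from distributed import apply_gradient_allreduce  # the left-hand helper above

# Assumes torch.distributed.init_process_group(...) already ran in this worker.
model = torch.nn.Linear(10, 10).cuda()
model = apply_gradient_allreduce(model)  # still a plain module: no .module indirection

loss = model(torch.randn(4, 10).cuda()).sum()
loss.backward()   # the registered hooks all-reduce and average gradients here
```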

hparams.py
View File

@@ -10,7 +10,7 @@ def create_hparams(hparams_string=None, verbose=False):
         # Experiment Parameters        #
         ################################
         epochs=500,
-        iters_per_checkpoint=1000,
+        iters_per_checkpoint=500,
         seed=1234,
         dynamic_loss_scaling=True,
         fp16_run=False,
@@ -23,10 +23,10 @@ def create_hparams(hparams_string=None, verbose=False):
         ################################
         # Data Parameters              #
         ################################
-        load_mel_from_disk=False,
-        training_files='filelists/ljs_audio22khz_text_train_filelist.txt',
-        validation_files='filelists/ljs_audio22khz_text_val_filelist.txt',
+        training_files='filelists/ljs_audio_text_train_filelist.txt',
+        validation_files='filelists/ljs_audio_text_val_filelist.txt',
         text_cleaners=['english_cleaners'],
+        sort_by_length=False,

         ################################
         # Audio Parameters             #
@@ -38,7 +38,7 @@ def create_hparams(hparams_string=None, verbose=False):
         win_length=1024,
         n_mel_channels=80,
         mel_fmin=0.0,
-        mel_fmax=8000.0,
+        mel_fmax=None,  # if None, half the sampling rate

         ################################
         # Model Parameters             #
@@ -52,13 +52,11 @@ def create_hparams(hparams_string=None, verbose=False):
         encoder_embedding_dim=512,

         # Decoder parameters
-        n_frames_per_step=1,  # currently only 1 is supported
+        n_frames_per_step=1,
         decoder_rnn_dim=1024,
         prenet_dim=256,
         max_decoder_steps=1000,
-        gate_threshold=0.5,
-        p_attention_dropout=0.1,
-        p_decoder_dropout=0.1,
+        gate_threshold=0.6,

         # Attention parameters
         attention_rnn_dim=1024,
@@ -76,12 +74,11 @@ def create_hparams(hparams_string=None, verbose=False):
         ################################
         # Optimization Hyperparameters #
         ################################
-        use_saved_learning_rate=False,
         learning_rate=1e-3,
         weight_decay=1e-6,
-        grad_clip_thresh=1.0,
-        batch_size=64,
-        mask_padding=True  # set model's padded outputs to padded values
+        grad_clip_thresh=1,
+        batch_size=48,
+        mask_padding=False  # set model's padded outputs to padded values
     )

     if hparams_string:
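
Both sides of this file end by feeding `hparams_string` into the parser, which is what the `--hparams` flag in train.py uses, so any of the values above can be overridden per run without editing the file. A small sketch; the override values here are illustrative only:

```python
from hparams import create_hparams

# Same comma-separated syntax the README shows for --hparams=...
hparams = create_hparams("fp16_run=True,distributed_run=True,batch_size=32")
print(hparams.batch_size)   # 32, overriding the default above
```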

inference.ipynb (113 changed lines), Executable file → Normal file

File diff suppressed because one or more lines are too long

layers.py
View File

@@ -10,7 +10,7 @@ class LinearNorm(torch.nn.Module):
         super(LinearNorm, self).__init__()
         self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)

-        torch.nn.init.xavier_uniform_(
+        torch.nn.init.xavier_uniform(
             self.linear_layer.weight,
             gain=torch.nn.init.calculate_gain(w_init_gain))
@@ -31,7 +31,7 @@ class ConvNorm(torch.nn.Module):
                              padding=padding, dilation=dilation,
                              bias=bias)

-        torch.nn.init.xavier_uniform_(
+        torch.nn.init.xavier_uniform(
             self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))

     def forward(self, signal):
@@ -42,7 +42,7 @@ class ConvNorm(torch.nn.Module):
 class TacotronSTFT(torch.nn.Module):
     def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
                  n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
-                 mel_fmax=8000.0):
+                 mel_fmax=None):
         super(TacotronSTFT, self).__init__()
         self.n_mel_channels = n_mel_channels
         self.sampling_rate = sampling_rate
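
`xavier_uniform` (right) is the pre-0.4 spelling that PyTorch 0.4 deprecated in favor of the in-place `xavier_uniform_` (left); both fill the tensor in place. A standalone sketch of the left-hand initialization with the gain logic used in these layers:

```python
import torch

linear = torch.nn.Linear(256, 512)
torch.nn.init.xavier_uniform_(                   # PyTorch >= 0.4 in-place spelling
    linear.weight,
    gain=torch.nn.init.calculate_gain('tanh'))   # e.g. w_init_gain='tanh' as above
```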

View File

@@ -51,10 +51,11 @@ class DynamicLossScaler:
     # `x` is a torch.Tensor
     def _has_inf_or_nan(x):
-        cpu_sum = float(x.float().sum())
-        if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
+        inf_count = torch.sum(x.abs() == float('inf'))
+        if inf_count > 0:
             return True
-        return False
+        nan_count = torch.sum(x != x)
+        return nan_count > 0

     # `overflow` is boolean indicating whether we overflowed in gradient
     def update_scale(self, overflow):
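
The left-hand overflow check folds everything into one sum: any inf or nan poisons the sum, so a single Python-float comparison detects it with one device-to-host sync. The right-hand version runs two explicit reductions instead. Both flag bad values; a runnable comparison:

```python
import torch

def has_inf_or_nan_via_sum(x):       # left side: one reduction
    s = float(x.float().sum())
    return s == float('inf') or s == -float('inf') or s != s

def has_inf_or_nan_via_masks(x):     # right side: two reductions
    if torch.sum(x.abs() == float('inf')) > 0:
        return True
    return bool(torch.sum(x != x) > 0)

x = torch.tensor([1.0, float('nan'), 3.0])
print(has_inf_or_nan_via_sum(x), has_inf_or_nan_via_masks(x))  # True True
```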

model.py
View File

@@ -1,4 +1,3 @@
-from math import sqrt
 import torch
 from torch.autograd import Variable
 from torch import nn
@@ -57,7 +56,7 @@ class Attention(nn.Module):
         processed_query = self.query_layer(query.unsqueeze(1))
         processed_attention_weights = self.location_layer(attention_weights_cat)
-        energies = self.v(torch.tanh(
+        energies = self.v(F.tanh(
             processed_query + processed_attention_weights + processed_memory))

         energies = energies.squeeze(-1)
@@ -108,6 +107,7 @@ class Postnet(nn.Module):
     def __init__(self, hparams):
         super(Postnet, self).__init__()
+        self.dropout = nn.Dropout(0.5)
         self.convolutions = nn.ModuleList()

         self.convolutions.append(
@@ -141,8 +141,9 @@ class Postnet(nn.Module):
     def forward(self, x):
         for i in range(len(self.convolutions) - 1):
-            x = F.dropout(torch.tanh(self.convolutions[i](x)), 0.5, self.training)
-        x = F.dropout(self.convolutions[-1](x), 0.5, self.training)
+            x = self.dropout(F.tanh(self.convolutions[i](x)))
+        x = self.dropout(self.convolutions[-1](x))

         return x
@@ -154,6 +155,7 @@ class Encoder(nn.Module):
     """
     def __init__(self, hparams):
         super(Encoder, self).__init__()
+        self.dropout = nn.Dropout(0.5)

         convolutions = []
         for _ in range(hparams.encoder_n_convolutions):
@@ -173,7 +175,7 @@ class Encoder(nn.Module):
     def forward(self, x, input_lengths):
         for conv in self.convolutions:
-            x = F.dropout(F.relu(conv(x)), 0.5, self.training)
+            x = self.dropout(F.relu(conv(x)))

         x = x.transpose(1, 2)
@@ -192,7 +194,7 @@ class Encoder(nn.Module):
     def inference(self, x):
         for conv in self.convolutions:
-            x = F.dropout(F.relu(conv(x)), 0.5, self.training)
+            x = self.dropout(F.relu(conv(x)))

         x = x.transpose(1, 2)
@@ -213,8 +215,6 @@ class Decoder(nn.Module):
         self.prenet_dim = hparams.prenet_dim
         self.max_decoder_steps = hparams.max_decoder_steps
         self.gate_threshold = hparams.gate_threshold
-        self.p_attention_dropout = hparams.p_attention_dropout
-        self.p_decoder_dropout = hparams.p_decoder_dropout

         self.prenet = Prenet(
             hparams.n_mel_channels * hparams.n_frames_per_step,
@@ -235,7 +235,7 @@ class Decoder(nn.Module):
         self.linear_projection = LinearNorm(
             hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
-            hparams.n_mel_channels * hparams.n_frames_per_step)
+            hparams.n_mel_channels*hparams.n_frames_per_step)

         self.gate_layer = LinearNorm(
             hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, 1,
@@ -350,13 +350,11 @@ class Decoder(nn.Module):
             gate_output: gate output energies
             attention_weights:
         """
+        decoder_input = self.prenet(decoder_input)
         cell_input = torch.cat((decoder_input, self.attention_context), -1)
         self.attention_hidden, self.attention_cell = self.attention_rnn(
             cell_input, (self.attention_hidden, self.attention_cell))
-        self.attention_hidden = F.dropout(
-            self.attention_hidden, self.p_attention_dropout, self.training)
-        self.attention_cell = F.dropout(
-            self.attention_cell, self.p_attention_dropout, self.training)

         attention_weights_cat = torch.cat(
             (self.attention_weights.unsqueeze(1),
@@ -370,10 +368,6 @@ class Decoder(nn.Module):
             (self.attention_hidden, self.attention_context), -1)
         self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
             decoder_input, (self.decoder_hidden, self.decoder_cell))
-        self.decoder_hidden = F.dropout(
-            self.decoder_hidden, self.p_decoder_dropout, self.training)
-        self.decoder_cell = F.dropout(
-            self.decoder_cell, self.p_decoder_dropout, self.training)

         decoder_hidden_attention_context = torch.cat(
             (self.decoder_hidden, self.attention_context), dim=1)
@@ -398,23 +392,23 @@ class Decoder(nn.Module):
             alignments: sequence of attention weights from the decoder
         """
-        decoder_input = self.get_go_frame(memory).unsqueeze(0)
+        decoder_input = self.get_go_frame(memory)
         decoder_inputs = self.parse_decoder_inputs(decoder_inputs)
-        decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0)
-        decoder_inputs = self.prenet(decoder_inputs)

         self.initialize_decoder_states(
             memory, mask=~get_mask_from_lengths(memory_lengths))

         mel_outputs, gate_outputs, alignments = [], [], []
-        while len(mel_outputs) < decoder_inputs.size(0) - 1:
-            decoder_input = decoder_inputs[len(mel_outputs)]
+        while len(mel_outputs) < decoder_inputs.size(0):
             mel_output, gate_output, attention_weights = self.decode(
                 decoder_input)
             mel_outputs += [mel_output.squeeze(1)]
             gate_outputs += [gate_output.squeeze()]
             alignments += [attention_weights]
+            decoder_input = decoder_inputs[len(mel_outputs) - 1]

         mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
             mel_outputs, gate_outputs, alignments)
@@ -437,15 +431,15 @@ class Decoder(nn.Module):
         self.initialize_decoder_states(memory, mask=None)

         mel_outputs, gate_outputs, alignments = [], [], []
         while True:
-            decoder_input = self.prenet(decoder_input)
             mel_output, gate_output, alignment = self.decode(decoder_input)

             mel_outputs += [mel_output.squeeze(1)]
-            gate_outputs += [gate_output]
+            gate_outputs += [gate_output.squeeze()]
             alignments += [alignment]

-            if torch.sigmoid(gate_output.data) > self.gate_threshold:
+            if F.sigmoid(gate_output.data) > self.gate_threshold:
                 break
             elif len(mel_outputs) == self.max_decoder_steps:
                 print("Warning! Reached max decoder steps")
@@ -468,9 +462,6 @@ class Tacotron2(nn.Module):
         self.n_frames_per_step = hparams.n_frames_per_step
         self.embedding = nn.Embedding(
             hparams.n_symbols, hparams.symbols_embedding_dim)
-        std = sqrt(2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim))
-        val = sqrt(3.0) * std  # uniform bounds for std
-        self.embedding.weight.data.uniform_(-val, val)
         self.encoder = Encoder(hparams)
         self.decoder = Decoder(hparams)
         self.postnet = Postnet(hparams)
@@ -480,7 +471,7 @@ class Tacotron2(nn.Module):
             output_lengths = batch
         text_padded = to_gpu(text_padded).long()
         input_lengths = to_gpu(input_lengths).long()
-        max_len = torch.max(input_lengths.data).item()
+        max_len = torch.max(input_lengths.data)
         mel_padded = to_gpu(mel_padded).float()
         gate_padded = to_gpu(gate_padded).float()
         output_lengths = to_gpu(output_lengths).long()
@@ -495,7 +486,7 @@ class Tacotron2(nn.Module):
     def parse_output(self, outputs, output_lengths=None):
         if self.mask_padding and output_lengths is not None:
-            mask = ~get_mask_from_lengths(output_lengths)
+            mask = ~get_mask_from_lengths(output_lengths+1)  # +1 <stop> token
             mask = mask.expand(self.n_mel_channels, mask.size(0), mask.size(1))
             mask = mask.permute(1, 0, 2)
@@ -504,6 +495,7 @@ class Tacotron2(nn.Module):
             outputs[2].data.masked_fill_(mask[:, 0, :], 1e3)  # gate energies

         outputs = fp16_to_fp32(outputs) if self.fp16_run else outputs
+
         return outputs

     def forward(self, inputs):
@@ -521,6 +513,14 @@ class Tacotron2(nn.Module):
         mel_outputs_postnet = self.postnet(mel_outputs)
         mel_outputs_postnet = mel_outputs + mel_outputs_postnet

+        # DataParallel expects equal sized inputs/outputs, hence padding
+        if input_lengths is not None:
+            alignments = alignments.unsqueeze(0)
+            alignments = nn.functional.pad(
+                alignments,
+                (0, max_len - alignments.size(3), 0, 0),
+                "constant", 0)
+            alignments = alignments.squeeze()
         return self.parse_output(
             [mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
             output_lengths)
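
The biggest behavioral difference above is where the prenet runs: on the left it is applied once to the whole teacher-forced input matrix before the loop (with the go frame prepended), while on the right it runs inside `decode()` every step. A schematic of the left-hand loop's indexing, with toy shapes standing in for the real model:

```python
import torch

# Toy shapes only: T teacher frames, batch B, n_mel channels.
T, B, n_mel = 5, 2, 80
go_frame = torch.zeros(1, B, n_mel)               # get_go_frame(...).unsqueeze(0)
teacher = torch.randn(T, B, n_mel)                # parse_decoder_inputs(...)
decoder_inputs = torch.cat((go_frame, teacher), dim=0)
# the prenet would run once here, over all T+1 steps at once

outputs = []
while len(outputs) < decoder_inputs.size(0) - 1:  # exactly T decode steps
    decoder_input = decoder_inputs[len(outputs)]  # frame t-1 conditions step t
    outputs.append(decoder_input.mean())          # stand-in for self.decode(...)
print(len(outputs))  # 5
```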

requirements.txt
View File

@@ -1,10 +1,9 @@
-torch==0.4.0
+torch==0.2.0.post3
 matplotlib==2.1.0
-tensorflow
+tensorflow==1.5.0
 numpy==1.13.3
 inflect==0.2.5
 librosa==0.6.0
 scipy==1.0.0
 tensorboardX==1.1
 Unidecode==1.0.22
-pillow

stft.py
View File

@@ -61,7 +61,7 @@ class STFT(torch.nn.Module):
                 np.linalg.pinv(scale * fourier_basis).T[:, None, :])

         if window is not None:
-            assert(filter_length >= win_length)
+            assert(win_length >= filter_length)
             # get window and zero center pad it to filter_length
             fft_window = get_window(window, win_length, fftbins=True)
             fft_window = pad_center(fft_window, filter_length)
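
The direction of this assert matters: `pad_center` below pads the window up to `filter_length`, which is only possible when `filter_length >= win_length`, so the left-hand condition is the correct one. A quick check, using librosa 0.6.0 as pinned in requirements.txt:

```python
from scipy.signal import get_window
from librosa.util import pad_center

filter_length, win_length = 1024, 800
fft_window = get_window('hann', win_length, fftbins=True)
fft_window = pad_center(fft_window, filter_length)  # needs filter_length >= win_length
assert fft_window.shape[0] == filter_length
```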

text/__init__.py
View File

@@ -37,6 +37,8 @@ def text_to_sequence(text, cleaner_names):
         sequence += _arpabet_to_sequence(m.group(2))
         text = m.group(3)

+    # Append EOS token
+    sequence.append(_symbol_to_id['~'])
     return sequence
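
With the right-hand change, every encoded utterance ends in the id of `'~'`, the EOS symbol introduced in text/symbols.py below. A hedged check, assuming the right-hand version of the repo's `text` package is importable:

```python
from text import text_to_sequence
from text.symbols import symbols

seq = text_to_sequence("Hello, world.", ['english_cleaners'])
assert seq[-1] == symbols.index('~')   # EOS id appended as the last element
```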

text/symbols.py
View File

@@ -7,12 +7,11 @@ The default is a set of ASCII characters that works well for English or text tha
 from text import cmudict

 _pad = '_'
-_punctuation = '!\'(),.:;? '
-_special = '-'
-_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
+_eos = '~'
+_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '

 # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
 _arpabet = ['@' + s for s in cmudict.valid_symbols]

 # Export all symbols:
-symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet
+symbols = [_pad, _eos] + list(_characters) + _arpabet

train.py
View File

@@ -2,12 +2,11 @@ import os
 import time
 import argparse
 import math
-from numpy import finfo

 import torch
-from distributed import apply_gradient_allreduce
-import torch.distributed as dist
+from distributed import DistributedDataParallel
 from torch.utils.data.distributed import DistributedSampler
+from torch.nn import DataParallel
 from torch.utils.data import DataLoader

 from fp16_optimizer import FP16_Optimizer
@@ -30,20 +29,19 @@ def batchnorm_to_float(module):
 def reduce_tensor(tensor, num_gpus):
     rt = tensor.clone()
-    dist.all_reduce(rt, op=dist.reduce_op.SUM)
+    torch.distributed.all_reduce(rt, op=torch.distributed.reduce_op.SUM)
     rt /= num_gpus
     return rt


 def init_distributed(hparams, n_gpus, rank, group_name):
     assert torch.cuda.is_available(), "Distributed mode requires CUDA."
-    print("Initializing Distributed")
+    print("Initializing distributed")

     # Set cuda device so everything is done on the right GPU.
     torch.cuda.set_device(rank % torch.cuda.device_count())

     # Initialize distributed communication
-    dist.init_process_group(
+    torch.distributed.init_process_group(
         backend=hparams.dist_backend, init_method=hparams.dist_url,
         world_size=n_gpus, rank=rank, group_name=group_name)
@@ -79,9 +77,7 @@ def prepare_directories_and_logger(output_directory, log_directory, rank):
 def load_model(hparams):
     model = Tacotron2(hparams).cuda()
-    if hparams.fp16_run:
-        model = batchnorm_to_float(model.half())
-        model.decoder.attention_layer.score_mask_value = float(finfo('float16').min)
+    model = batchnorm_to_float(model.half()) if hparams.fp16_run else model

     if hparams.distributed_run:
         model = DistributedDataParallel(model)
@@ -132,20 +128,22 @@ def validate(model, criterion, valset, iteration, batch_size, n_gpus,
                             pin_memory=False, collate_fn=collate_fn)

     val_loss = 0.0
+    if distributed_run or torch.cuda.device_count() > 1:
+        batch_parser = model.module.parse_batch
+    else:
+        batch_parser = model.parse_batch
     for i, batch in enumerate(val_loader):
-        x, y = model.parse_batch(batch)
+        x, y = batch_parser(batch)
         y_pred = model(x)
         loss = criterion(y_pred, y)
-        if distributed_run:
-            reduced_val_loss = reduce_tensor(loss.data, num_gpus).item()
-        else:
-            reduced_val_loss = loss.item()
+        reduced_val_loss = reduce_tensor(loss.data, n_gpus)[0] \
+            if distributed_run else loss.data[0]
         val_loss += reduced_val_loss
     val_loss = val_loss / (i + 1)

     model.train()
-    print("Validation loss {}: {:9f}  ".format(iteration, reduced_val_loss))
-    logger.log_validation(reduced_val_loss, model, y, y_pred, iteration)
+    return val_loss


 def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
@@ -175,9 +173,6 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
         optimizer = FP16_Optimizer(
             optimizer, dynamic_loss_scale=hparams.dynamic_loss_scaling)

-    if hparams.distributed_run:
-        model = apply_gradient_allreduce(model)

     criterion = Tacotron2Loss()

     logger = prepare_directories_and_logger(
@@ -192,14 +187,16 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
         if warm_start:
             model = warm_start_model(checkpoint_path, model)
         else:
-            model, optimizer, _learning_rate, iteration = load_checkpoint(
+            model, optimizer, learning_rate, iteration = load_checkpoint(
                 checkpoint_path, model, optimizer)
-            if hparams.use_saved_learning_rate:
-                learning_rate = _learning_rate
             iteration += 1  # next iteration is iteration + 1
             epoch_offset = max(0, int(iteration / len(train_loader)))

     model.train()
+    if hparams.distributed_run or torch.cuda.device_count() > 1:
+        batch_parser = model.module.parse_batch
+    else:
+        batch_parser = model.parse_batch
     # ================ MAIN TRAINNIG LOOP! ===================
     for epoch in range(epoch_offset, hparams.epochs):
         print("Epoch: {}".format(epoch))
@@ -209,21 +206,18 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
                 param_group['lr'] = learning_rate

             model.zero_grad()
-            x, y = model.parse_batch(batch)
+            x, y = batch_parser(batch)
             y_pred = model(x)
             loss = criterion(y_pred, y)
-            if hparams.distributed_run:
-                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
-            else:
-                reduced_loss = loss.item()
+            reduced_loss = reduce_tensor(loss.data, n_gpus)[0] \
+                if hparams.distributed_run else loss.data[0]

             if hparams.fp16_run:
                 optimizer.backward(loss)
                 grad_norm = optimizer.clip_fp32_grads(hparams.grad_clip_thresh)
             else:
                 loss.backward()
-                grad_norm = torch.nn.utils.clip_grad_norm_(
+                grad_norm = torch.nn.utils.clip_grad_norm(
                     model.parameters(), hparams.grad_clip_thresh)

             optimizer.step()
@@ -234,14 +228,20 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
                 duration = time.perf_counter() - start
                 print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                     iteration, reduced_loss, grad_norm, duration))

                 logger.log_training(
                     reduced_loss, grad_norm, learning_rate, duration, iteration)

             if not overflow and (iteration % hparams.iters_per_checkpoint == 0):
-                validate(model, criterion, valset, iteration, hparams.batch_size,
-                         n_gpus, collate_fn, logger, hparams.distributed_run, rank)
+                reduced_val_loss = validate(
+                    model, criterion, valset, iteration, hparams.batch_size,
+                    n_gpus, collate_fn, logger, hparams.distributed_run, rank)
                 if rank == 0:
+                    print("Validation loss {}: {:9f}  ".format(
+                        iteration, reduced_val_loss))
+                    logger.log_validation(
+                        reduced_val_loss, model, y, y_pred, iteration)
                     checkpoint_path = os.path.join(
                         output_directory, "checkpoint_{}".format(iteration))
                     save_checkpoint(model, optimizer, learning_rate, iteration,
@@ -276,7 +276,7 @@ if __name__ == '__main__':
     torch.backends.cudnn.benchmark = hparams.cudnn_benchmark

     print("FP16 Run:", hparams.fp16_run)
-    print("Dynamic Loss Scaling:", hparams.dynamic_loss_scaling)
+    print("Dynamic Loss Scaling", hparams.dynamic_loss_scaling)
     print("Distributed Run:", hparams.distributed_run)
     print("cuDNN Enabled:", hparams.cudnn_enabled)
     print("cuDNN Benchmark:", hparams.cudnn_benchmark)

utils.py
View File

@@ -4,26 +4,29 @@ import torch


 def get_mask_from_lengths(lengths):
-    max_len = torch.max(lengths).item()
-    ids = torch.arange(0, max_len, out=torch.cuda.LongTensor(max_len))
+    max_len = torch.max(lengths)
+    ids = torch.arange(0, max_len, out=torch.LongTensor(max_len)).cuda()
     mask = (ids < lengths.unsqueeze(1)).byte()
     return mask


-def load_wav_to_torch(full_path):
+def load_wav_to_torch(full_path, sr):
     sampling_rate, data = read(full_path)
-    return torch.FloatTensor(data.astype(np.float32)), sampling_rate
+    assert sr == sampling_rate, "{} SR doesn't match {} on path {}".format(
+        sr, sampling_rate, full_path)
+    return torch.FloatTensor(data.astype(np.float32))


-def load_filepaths_and_text(filename, split="|"):
+def load_filepaths_and_text(filename, sort_by_length, split="|"):
     with open(filename, encoding='utf-8') as f:
         filepaths_and_text = [line.strip().split(split) for line in f]
+
+    if sort_by_length:
+        filepaths_and_text.sort(key=lambda x: len(x[1]))

     return filepaths_and_text


 def to_gpu(x):
-    x = x.contiguous()
-    if torch.cuda.is_available():
-        x = x.cuda(non_blocking=True)
+    x = x.contiguous().cuda(async=True)
     return torch.autograd.Variable(x)
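
The left-hand `to_gpu` guards the transfer with `torch.cuda.is_available()` and uses `non_blocking=True`; the right-hand `async=True` spelling stops parsing entirely on Python 3.7+, where `async` is a reserved keyword. The left variant, runnable with or without a GPU:

```python
import torch

def to_gpu(x):
    x = x.contiguous()
    if torch.cuda.is_available():
        x = x.cuda(non_blocking=True)  # replaces the reserved-word async=True
    return torch.autograd.Variable(x)

print(to_gpu(torch.ones(2)).shape)  # torch.Size([2])
```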

Submodule waveglow deleted from 4b1001fa33