Mirror of https://github.com/malarinv/tacotron2
Commit f02704f338

README.md
@@ -1,6 +1,6 @@
 # Tacotron 2 (without wavenet)
 
-Tacotron 2 PyTorch implementation of [Natural TTS Synthesis By Conditioning
+PyTorch implementation of [Natural TTS Synthesis By Conditioning
 Wavenet On Mel Spectrogram Predictions](https://arxiv.org/pdf/1712.05884.pdf).
 
 This implementation includes **distributed** and **fp16** support
@@ -11,9 +11,7 @@ Distributed and FP16 support relies on work by Christian Sarofeen and NVIDIA's
 
-[Download demo audio](https://github.com/NVIDIA/tacotron2/blob/master/demo.wav) trained on LJS and using Ryuichi Yamamoto's [pre-trained Mixture of Logistics
-wavenet](https://github.com/r9y9/wavenet_vocoder/):
-"Scientists at the CERN laboratory say they have discovered a new particle."
+Visit our [website] for audio samples.
 
 ## Pre-requisites
 1. NVIDIA GPU + CUDA cuDNN
@@ -24,11 +22,9 @@ wavenet](https://github.com/r9y9/wavenet_vocoder/)
 3. CD into this repo: `cd tacotron2`
 4. Update .wav paths: `sed -i -- 's,DUMMY,ljs_dataset_folder/wavs,g' filelists/*.txt`
     - Alternatively, set `load_mel_from_disk=True` in `hparams.py` and update mel-spectrogram paths
-5. Install [pytorch 0.4](https://github.com/pytorch/pytorch)
+5. Install [PyTorch 1.0]
 6. Install python requirements or build docker image
     - Install python requirements: `pip install -r requirements.txt`
-    - **OR**
-    - Build docker image: `docker build --tag tacotron2 .`
 
 ## Training
 1. `python train.py --output_directory=outdir --log_directory=logdir`
@@ -37,17 +33,22 @@ wavenet](https://github.com/r9y9/wavenet_vocoder/)
 ## Multi-GPU (distributed) and FP16 Training
 1. `python -m multiproc train.py --output_directory=outdir --log_directory=logdir --hparams=distributed_run=True,fp16_run=True`
 
-## Inference
-When performing Mel-Spectrogram to Audio synthesis with a WaveNet model, make sure Tacotron 2 and WaveNet were trained on the same mel-spectrogram representation. Follow these steps to use a simple inference pipeline using Griffin-Lim:
-1. `jupyter notebook --ip=127.0.0.1 --port=31337`
-2. Load inference.ipynb
+## Inference demo
+1. Download our published [Tacotron 2] model
+2. Download our published [WaveGlow] model
+3. `jupyter notebook --ip=127.0.0.1 --port=31337`
+4. Load inference.ipynb
+
+N.b. When performing Mel-Spectrogram to Audio synthesis, make sure Tacotron 2
+and the Mel decoder were trained on the same mel-spectrogram representation.
 
 ## Related repos
-[nv-wavenet](https://github.com/NVIDIA/nv-wavenet/): Faster than real-time
-wavenet inference
+[WaveGlow](https://github.com/NVIDIA/WaveGlow): faster-than-real-time flow-based
+generative network for speech synthesis
+
+[nv-wavenet](https://github.com/NVIDIA/nv-wavenet/): faster-than-real-time
+WaveNet.
 
 ## Acknowledgements
 This implementation uses code from the following repos: [Keith
@@ -61,3 +62,7 @@ We are thankful to the Tacotron 2 paper authors, especially Jonathan Shen, Yuxuan
 Wang and Zongheng Yang.
+
+[WaveGlow]: https://drive.google.com/file/d/1cjKPHbtAMh_4HTHmuIGNkbOkPBD9qwhj/view?usp=sharing
+[Tacotron 2]: https://drive.google.com/file/d/1c5ZTuT7J08wLUoVZ2KkUs_VdZuJ86ZqA/view?usp=sharing
+[pytorch 1.0]: https://github.com/pytorch/pytorch#installation
+[website]: https://nv-adlr.github.io/WaveGlow
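For context, a minimal sketch of the synthesis path the new "Inference demo" section points at is shown below. It is not a copy of inference.ipynb (that diff is suppressed further down); the checkpoint file names are placeholders, and it assumes the repo's `create_hparams`, `Tacotron2`, and `text_to_sequence` plus a WaveGlow checkpoint object that exposes `infer`.

```python
# Hypothetical sketch of the inference flow; checkpoint paths are placeholders.
import numpy as np
import torch
from hparams import create_hparams
from model import Tacotron2
from text import text_to_sequence

hparams = create_hparams()
model = Tacotron2(hparams).cuda().eval()
model.load_state_dict(torch.load("tacotron2_checkpoint.pt")["state_dict"])  # placeholder path

text = "Scientists at the CERN laboratory say they have discovered a new particle."
sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[None, :]
sequence = torch.from_numpy(sequence).cuda().long()

# Tacotron 2 predicts the mel spectrogram from the symbol ids.
_, mel_outputs_postnet, _, _ = model.inference(sequence)

# WaveGlow (trained on the same mel representation, per the note above)
# turns the predicted mel into audio.
waveglow = torch.load("waveglow_checkpoint.pt")["model"].cuda().eval()  # placeholder path
with torch.no_grad():
    audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
```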

data_utils.py
@@ -14,9 +14,8 @@ class TextMelLoader(torch.utils.data.Dataset):
         2) normalizes text and converts them to sequences of one-hot vectors
         3) computes mel-spectrograms from audio files.
     """
-    def __init__(self, audiopaths_and_text, hparams, shuffle=True):
-        self.audiopaths_and_text = load_filepaths_and_text(
-            audiopaths_and_text, hparams.sort_by_length)
+    def __init__(self, audiopaths_and_text, hparams):
+        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
         self.text_cleaners = hparams.text_cleaners
         self.max_wav_value = hparams.max_wav_value
         self.sampling_rate = hparams.sampling_rate
@@ -26,7 +25,6 @@ class TextMelLoader(torch.utils.data.Dataset):
             hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
             hparams.mel_fmax)
         random.seed(1234)
-        if shuffle:
-            random.shuffle(self.audiopaths_and_text)
+        random.shuffle(self.audiopaths_and_text)
 
     def get_mel_text_pair(self, audiopath_and_text):
@@ -38,7 +36,10 @@ class TextMelLoader(torch.utils.data.Dataset):
     def get_mel(self, filename):
         if not self.load_mel_from_disk:
-            audio = load_wav_to_torch(filename, self.sampling_rate)
+            audio, sampling_rate = load_wav_to_torch(filename)
+            if sampling_rate != self.stft.sampling_rate:
+                raise ValueError("{} {} SR doesn't match target {} SR".format(
+                    sampling_rate, self.stft.sampling_rate))
             audio_norm = audio / self.max_wav_value
             audio_norm = audio_norm.unsqueeze(0)
             audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
@@ -87,9 +88,9 @@ class TextMelCollate():
             text = batch[ids_sorted_decreasing[i]][0]
             text_padded[i, :text.size(0)] = text
 
-        # Right zero-pad mel-spec with extra single zero vector to mark the end
+        # Right zero-pad mel-spec
         num_mels = batch[0][1].size(0)
-        max_target_len = max([x[1].size(1) for x in batch]) + 1
+        max_target_len = max([x[1].size(1) for x in batch])
         if max_target_len % self.n_frames_per_step != 0:
             max_target_len += self.n_frames_per_step - max_target_len % self.n_frames_per_step
             assert max_target_len % self.n_frames_per_step == 0
@@ -103,7 +104,7 @@ class TextMelCollate():
         for i in range(len(ids_sorted_decreasing)):
             mel = batch[ids_sorted_decreasing[i]][1]
             mel_padded[i, :, :mel.size(1)] = mel
-            gate_padded[i, mel.size(1):] = 1
+            gate_padded[i, mel.size(1)-1:] = 1
             output_lengths[i] = mel.size(1)
 
         return text_padded, input_lengths, mel_padded, gate_padded, \
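The gate-target change above is easiest to see on a toy batch: with the extra `+ 1` padding frame gone, the stop target now switches to 1 on the last real frame rather than one frame past it. A small illustration (not repo code):

```python
# Sketch: gate targets for two mels with 3 and 5 frames, padded to length 5.
import torch

lengths = [3, 5]
max_target_len = max(lengths)
gate_padded = torch.zeros(len(lengths), max_target_len)
for i, T in enumerate(lengths):
    gate_padded[i, T - 1:] = 1   # last valid frame and all padding are marked "stop"

print(gate_padded)
# tensor([[0., 0., 1., 1., 1.],
#         [0., 0., 0., 0., 1.]])
```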

distributed.py
@@ -118,3 +118,55 @@ class DistributedDataParallel(Module):
         super(DistributedDataParallel, self).train(mode)
         self.module.train(mode)
     '''
+'''
+Modifies existing model to do gradient allreduce, but doesn't change class
+so you don't need "module"
+'''
+def apply_gradient_allreduce(module):
+    if not hasattr(dist, '_backend'):
+        module.warn_on_half = True
+    else:
+        module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False
+
+    for p in module.state_dict().values():
+        if not torch.is_tensor(p):
+            continue
+        dist.broadcast(p, 0)
+
+    def allreduce_params():
+        if(module.needs_reduction):
+            module.needs_reduction = False
+            buckets = {}
+            for param in module.parameters():
+                if param.requires_grad and param.grad is not None:
+                    tp = type(param.data)
+                    if tp not in buckets:
+                        buckets[tp] = []
+                    buckets[tp].append(param)
+            if module.warn_on_half:
+                if torch.cuda.HalfTensor in buckets:
+                    print("WARNING: gloo dist backend for half parameters may be extremely slow." +
+                          " It is recommended to use the NCCL backend in this case. This currently requires" +
+                          "PyTorch built from top of tree master.")
+                    module.warn_on_half = False
+
+            for tp in buckets:
+                bucket = buckets[tp]
+                grads = [param.grad.data for param in bucket]
+                coalesced = _flatten_dense_tensors(grads)
+                dist.all_reduce(coalesced)
+                coalesced /= dist.get_world_size()
+                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
+                    buf.copy_(synced)
+
+    for param in list(module.parameters()):
+        def allreduce_hook(*unused):
+            param._execution_engine.queue_callback(allreduce_params)
+        if param.requires_grad:
+            param.register_hook(allreduce_hook)
+
+    def set_needs_reduction(self, input, output):
+        self.needs_reduction = True
+
+    module.register_forward_hook(set_needs_reduction)
+    return module

hparams.py
@@ -10,7 +10,7 @@ def create_hparams(hparams_string=None, verbose=False):
         # Experiment Parameters #
         ################################
         epochs=500,
-        iters_per_checkpoint=500,
+        iters_per_checkpoint=1000,
         seed=1234,
         dynamic_loss_scaling=True,
         fp16_run=False,
@@ -24,10 +24,9 @@ def create_hparams(hparams_string=None, verbose=False):
         # Data Parameters #
         ################################
         load_mel_from_disk=False,
-        training_files='filelists/ljs_audio_text_train_filelist.txt',
-        validation_files='filelists/ljs_audio_text_val_filelist.txt',
+        training_files='filelists/ljs_audio22khz_text_train_filelist.txt',
+        validation_files='filelists/ljs_audio22khz_text_val_filelist.txt',
         text_cleaners=['english_cleaners'],
-        sort_by_length=False,
 
         ################################
         # Audio Parameters #
@@ -39,7 +38,7 @@ def create_hparams(hparams_string=None, verbose=False):
         win_length=1024,
         n_mel_channels=80,
         mel_fmin=0.0,
-        mel_fmax=None,  # if None, half the sampling rate
+        mel_fmax=8000.0,
 
         ################################
         # Model Parameters #
@@ -57,7 +56,9 @@ def create_hparams(hparams_string=None, verbose=False):
         decoder_rnn_dim=1024,
         prenet_dim=256,
         max_decoder_steps=1000,
-        gate_threshold=0.6,
+        gate_threshold=0.5,
+        p_attention_dropout=0.1,
+        p_decoder_dropout=0.1,
 
         # Attention parameters
         attention_rnn_dim=1024,
@@ -78,9 +79,9 @@ def create_hparams(hparams_string=None, verbose=False):
         use_saved_learning_rate=False,
         learning_rate=1e-3,
         weight_decay=1e-6,
-        grad_clip_thresh=1,
+        grad_clip_thresh=1.0,
-        batch_size=48,
+        batch_size=64,
-        mask_padding=False  # set model's padded outputs to padded values
+        mask_padding=True  # set model's padded outputs to padded values
     )
 
     if hparams_string:
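These defaults can still be overridden from the command line. A short sketch of how the `--hparams` string is consumed, assuming `create_hparams` parses a comma-separated `name=value` string as in this file:

```python
# Sketch: overriding the defaults above, as train.py's --hparams flag does.
from hparams import create_hparams

hparams = create_hparams("distributed_run=True,fp16_run=True,batch_size=32")
print(hparams.batch_size)            # 32 instead of the new default 64
print(hparams.iters_per_checkpoint)  # 1000
```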

inference.ipynb
File diff suppressed because one or more lines are too long

layers.py
@@ -10,7 +10,7 @@ class LinearNorm(torch.nn.Module):
         super(LinearNorm, self).__init__()
         self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
 
-        torch.nn.init.xavier_uniform(
+        torch.nn.init.xavier_uniform_(
             self.linear_layer.weight,
             gain=torch.nn.init.calculate_gain(w_init_gain))
@@ -31,7 +31,7 @@ class ConvNorm(torch.nn.Module):
                                     padding=padding, dilation=dilation,
                                     bias=bias)
 
-        torch.nn.init.xavier_uniform(
+        torch.nn.init.xavier_uniform_(
             self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
 
     def forward(self, signal):
@@ -42,7 +42,7 @@ class ConvNorm(torch.nn.Module):
 class TacotronSTFT(torch.nn.Module):
     def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
                  n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
-                 mel_fmax=None):
+                 mel_fmax=8000.0):
         super(TacotronSTFT, self).__init__()
         self.n_mel_channels = n_mel_channels
         self.sampling_rate = sampling_rate
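To see the new default in context, a short sketch of how TacotronSTFT is used for feature extraction, mirroring `TextMelLoader.get_mel` in data_utils.py; the random waveform is just a stand-in for real audio:

```python
# Sketch: computing a mel spectrogram with the updated mel_fmax=8000.0 default.
import torch
from layers import TacotronSTFT

stft = TacotronSTFT(filter_length=1024, hop_length=256, win_length=1024,
                    n_mel_channels=80, sampling_rate=22050,
                    mel_fmin=0.0, mel_fmax=8000.0)

audio = torch.randn(22050)                   # one second of stand-in 22.05 kHz audio
audio_norm = (audio / 32768.0).unsqueeze(0)  # max_wav_value scaling plus a batch dim
mel = stft.mel_spectrogram(audio_norm)       # shape: (1, 80, n_frames)
```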

model.py
@@ -1,3 +1,4 @@
+from math import sqrt
 import torch
 from torch.autograd import Variable
 from torch import nn
@@ -56,7 +57,7 @@ class Attention(nn.Module):
         processed_query = self.query_layer(query.unsqueeze(1))
         processed_attention_weights = self.location_layer(attention_weights_cat)
-        energies = self.v(F.tanh(
+        energies = self.v(torch.tanh(
             processed_query + processed_attention_weights + processed_memory))
 
         energies = energies.squeeze(-1)
@@ -107,7 +108,6 @@ class Postnet(nn.Module):
     def __init__(self, hparams):
         super(Postnet, self).__init__()
-        self.dropout = nn.Dropout(0.5)
         self.convolutions = nn.ModuleList()
 
         self.convolutions.append(
@@ -141,9 +141,8 @@ class Postnet(nn.Module):
     def forward(self, x):
         for i in range(len(self.convolutions) - 1):
-            x = self.dropout(F.tanh(self.convolutions[i](x)))
+            x = F.dropout(torch.tanh(self.convolutions[i](x)), 0.5, self.training)
-        x = self.dropout(self.convolutions[-1](x))
+        x = F.dropout(self.convolutions[-1](x), 0.5, self.training)
 
         return x
@@ -155,7 +154,6 @@ class Encoder(nn.Module):
     """
     def __init__(self, hparams):
         super(Encoder, self).__init__()
-        self.dropout = nn.Dropout(0.5)
 
         convolutions = []
         for _ in range(hparams.encoder_n_convolutions):
@@ -175,7 +173,7 @@ class Encoder(nn.Module):
     def forward(self, x, input_lengths):
         for conv in self.convolutions:
-            x = self.dropout(F.relu(conv(x)))
+            x = F.dropout(F.relu(conv(x)), 0.5, self.training)
 
         x = x.transpose(1, 2)
@@ -194,7 +192,7 @@ class Encoder(nn.Module):
     def inference(self, x):
         for conv in self.convolutions:
-            x = self.dropout(F.relu(conv(x)))
+            x = F.dropout(F.relu(conv(x)), 0.5, self.training)
 
         x = x.transpose(1, 2)
@@ -215,13 +213,15 @@ class Decoder(nn.Module):
         self.prenet_dim = hparams.prenet_dim
         self.max_decoder_steps = hparams.max_decoder_steps
         self.gate_threshold = hparams.gate_threshold
+        self.p_attention_dropout = hparams.p_attention_dropout
+        self.p_decoder_dropout = hparams.p_decoder_dropout
 
         self.prenet = Prenet(
             hparams.n_mel_channels * hparams.n_frames_per_step,
             [hparams.prenet_dim, hparams.prenet_dim])
 
         self.attention_rnn = nn.LSTMCell(
-            hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
+            hparams.prenet_dim + hparams.encoder_embedding_dim,
             hparams.attention_rnn_dim)
 
         self.attention_layer = Attention(
@@ -230,12 +230,12 @@ class Decoder(nn.Module):
             hparams.attention_location_kernel_size)
 
         self.decoder_rnn = nn.LSTMCell(
-            hparams.prenet_dim + hparams.encoder_embedding_dim,
+            hparams.attention_rnn_dim + hparams.encoder_embedding_dim,
             hparams.decoder_rnn_dim, 1)
 
         self.linear_projection = LinearNorm(
             hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
-            hparams.n_mel_channels*hparams.n_frames_per_step)
+            hparams.n_mel_channels * hparams.n_frames_per_step)
 
         self.gate_layer = LinearNorm(
             hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, 1,
@@ -350,10 +350,13 @@ class Decoder(nn.Module):
             gate_output: gate output energies
             attention_weights:
         """
-        cell_input = torch.cat((self.decoder_hidden, self.attention_context), -1)
+        cell_input = torch.cat((decoder_input, self.attention_context), -1)
         self.attention_hidden, self.attention_cell = self.attention_rnn(
             cell_input, (self.attention_hidden, self.attention_cell))
+        self.attention_hidden = F.dropout(
+            self.attention_hidden, self.p_attention_dropout, self.training)
+        self.attention_cell = F.dropout(
+            self.attention_cell, self.p_attention_dropout, self.training)
 
         attention_weights_cat = torch.cat(
             (self.attention_weights.unsqueeze(1),
@@ -363,10 +366,14 @@ class Decoder(nn.Module):
             attention_weights_cat, self.mask)
 
         self.attention_weights_cum += self.attention_weights
-        prenet_output = self.prenet(decoder_input)
-        decoder_input = torch.cat((prenet_output, self.attention_context), -1)
+        decoder_input = torch.cat(
+            (self.attention_hidden, self.attention_context), -1)
         self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
             decoder_input, (self.decoder_hidden, self.decoder_cell))
+        self.decoder_hidden = F.dropout(
+            self.decoder_hidden, self.p_decoder_dropout, self.training)
+        self.decoder_cell = F.dropout(
+            self.decoder_cell, self.p_decoder_dropout, self.training)
 
         decoder_hidden_attention_context = torch.cat(
             (self.decoder_hidden, self.attention_context), dim=1)
@@ -391,22 +398,23 @@ class Decoder(nn.Module):
             alignments: sequence of attention weights from the decoder
         """
 
-        decoder_input = self.get_go_frame(memory)
+        decoder_input = self.get_go_frame(memory).unsqueeze(0)
         decoder_inputs = self.parse_decoder_inputs(decoder_inputs)
+        decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0)
+        decoder_inputs = self.prenet(decoder_inputs)
 
         self.initialize_decoder_states(
             memory, mask=~get_mask_from_lengths(memory_lengths))
 
         mel_outputs, gate_outputs, alignments = [], [], []
-        while len(mel_outputs) < decoder_inputs.size(0):
+        while len(mel_outputs) < decoder_inputs.size(0) - 1:
+            decoder_input = decoder_inputs[len(mel_outputs)]
             mel_output, gate_output, attention_weights = self.decode(
                 decoder_input)
-            mel_outputs += [mel_output]
+            mel_outputs += [mel_output.squeeze(1)]
-            gate_outputs += [gate_output.squeeze(1)]
+            gate_outputs += [gate_output.squeeze()]
             alignments += [attention_weights]
 
-            decoder_input = decoder_inputs[len(mel_outputs) - 1]
 
         mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
             mel_outputs, gate_outputs, alignments)
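The reshaping around the new teacher-forced loop can be summarized with dummy shapes. The sketch below is illustration only (it assumes `n_frames_per_step=1` and leaves the prenet call as a comment, since that needs a constructed Decoder):

```python
# Shape sketch of the new teacher-forced decoder input prep (illustration only).
import torch

B, n_mel, T_out = 2, 80, 7
mels = torch.randn(B, n_mel, T_out)        # padded target mels from the collate fn

# parse_decoder_inputs: (B, n_mel, T_out) -> (T_out, B, n_mel) when n_frames_per_step=1
decoder_inputs = mels.permute(2, 0, 1)

# prepend the all-zero <GO> frame; the prenet is then applied to every step at
# once (decoder_inputs = self.prenet(decoder_inputs) in the real code)
go_frame = torch.zeros(1, B, n_mel)
decoder_inputs = torch.cat((go_frame, decoder_inputs), dim=0)   # (T_out + 1, B, n_mel)

# the loop reads decoder_inputs[len(mel_outputs)] each step and stops at
# decoder_inputs.size(0) - 1, so exactly T_out frames are produced and the
# final target frame is never fed back in
print(decoder_inputs.size(0) - 1)   # 7 decoding steps
```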
@@ -430,13 +438,14 @@ class Decoder(nn.Module):
 
         mel_outputs, gate_outputs, alignments = [], [], []
         while True:
+            decoder_input = self.prenet(decoder_input)
             mel_output, gate_output, alignment = self.decode(decoder_input)
 
-            mel_outputs += [mel_output]
+            mel_outputs += [mel_output.squeeze(1)]
-            gate_outputs += [gate_output.squeeze(1)]
+            gate_outputs += [gate_output]
             alignments += [alignment]
 
-            if F.sigmoid(gate_output.data) > self.gate_threshold:
+            if torch.sigmoid(gate_output.data) > self.gate_threshold:
                 break
             elif len(mel_outputs) == self.max_decoder_steps:
                 print("Warning! Reached max decoder steps")
@@ -459,8 +468,9 @@ class Tacotron2(nn.Module):
         self.n_frames_per_step = hparams.n_frames_per_step
         self.embedding = nn.Embedding(
             hparams.n_symbols, hparams.symbols_embedding_dim)
-        torch.nn.init.xavier_uniform_(self.embedding.weight.data)
+        std = sqrt(2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim))
+        val = sqrt(3.0) * std  # uniform bounds for std
+        self.embedding.weight.data.uniform_(-val, val)
         self.encoder = Encoder(hparams)
         self.decoder = Decoder(hparams)
         self.postnet = Postnet(hparams)
@@ -469,8 +479,8 @@ class Tacotron2(nn.Module):
         text_padded, input_lengths, mel_padded, gate_padded, \
             output_lengths = batch
         text_padded = to_gpu(text_padded).long()
-        max_len = int(torch.max(input_lengths.data).numpy())
         input_lengths = to_gpu(input_lengths).long()
+        max_len = torch.max(input_lengths.data).item()
         mel_padded = to_gpu(mel_padded).float()
         gate_padded = to_gpu(gate_padded).float()
         output_lengths = to_gpu(output_lengths).long()
@@ -485,7 +495,7 @@ class Tacotron2(nn.Module):
     def parse_output(self, outputs, output_lengths=None):
         if self.mask_padding and output_lengths is not None:
-            mask = ~get_mask_from_lengths(output_lengths+1)  # +1 <stop> token
+            mask = ~get_mask_from_lengths(output_lengths)
             mask = mask.expand(self.n_mel_channels, mask.size(0), mask.size(1))
             mask = mask.permute(1, 0, 2)
@@ -494,7 +504,6 @@ class Tacotron2(nn.Module):
             outputs[2].data.masked_fill_(mask[:, 0, :], 1e3)  # gate energies
 
         outputs = fp16_to_fp32(outputs) if self.fp16_run else outputs
 
         return outputs
 
     def forward(self, inputs):
@@ -512,14 +521,6 @@ class Tacotron2(nn.Module):
         mel_outputs_postnet = self.postnet(mel_outputs)
         mel_outputs_postnet = mel_outputs + mel_outputs_postnet
 
-        # DataParallel expects equal sized inputs/outputs, hence padding
-        if input_lengths is not None:
-            alignments = alignments.unsqueeze(0)
-            alignments = nn.functional.pad(
-                alignments,
-                (0, max_len - alignments.size(3), 0, 0),
-                "constant", 0)
-            alignments = alignments.squeeze()
         return self.parse_output(
             [mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
             output_lengths)
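A quick numeric check of the new embedding initialization: a uniform distribution on [-val, val] has standard deviation val/sqrt(3), so choosing val = sqrt(3) * std reproduces the Xavier-style target std = sqrt(2 / (n_symbols + embedding_dim)). The sizes below are example values:

```python
# Illustration: the sampled weights end up with (approximately) the target std.
import torch
from math import sqrt

n_symbols, embedding_dim = 148, 512
std = sqrt(2.0 / (n_symbols + embedding_dim))
val = sqrt(3.0) * std

weight = torch.empty(n_symbols, embedding_dim).uniform_(-val, val)
print(std, weight.std().item())   # the two values should be close
```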

requirements.txt
@@ -1,4 +1,3 @@
-torch==0.4.0
 matplotlib==2.1.0
 tensorflow
 numpy==1.13.3

stft.py
@@ -61,7 +61,7 @@ class STFT(torch.nn.Module):
             np.linalg.pinv(scale * fourier_basis).T[:, None, :])
 
         if window is not None:
-            assert(win_length >= filter_length)
+            assert(filter_length >= win_length)
             # get window and zero center pad it to filter_length
             fft_window = get_window(window, win_length, fftbins=True)
             fft_window = pad_center(fft_window, filter_length)
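The corrected assert matches what the next two lines need: `pad_center` zero-pads the length-`win_length` window out to `filter_length`, which only works when `filter_length >= win_length`. A standalone illustration with the default sizes (the repo passes the size positionally; the keyword form is used here for portability across librosa versions):

```python
# Illustration: a Hann window of win_length samples is centred and zero-padded
# out to filter_length, so filter_length must be the larger of the two.
from scipy.signal import get_window
from librosa.util import pad_center

filter_length, win_length = 1024, 1024
fft_window = get_window("hann", win_length, fftbins=True)
fft_window = pad_center(fft_window, size=filter_length)  # fails if filter_length < win_length
print(fft_window.shape)  # (1024,)
```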

text/__init__.py
@@ -37,8 +37,6 @@ def text_to_sequence(text, cleaner_names):
         sequence += _arpabet_to_sequence(m.group(2))
         text = m.group(3)
 
-    # Append EOS token
-    sequence.append(_symbol_to_id['~'])
     return sequence

text/symbols.py
@@ -7,11 +7,12 @@ The default is a set of ASCII characters that works well for English or text tha
 from text import cmudict
 
 _pad = '_'
-_eos = '~'
-_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '
+_punctuation = '!\'(),.:;? '
+_special = '-'
+_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
 
 # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
 _arpabet = ['@' + s for s in cmudict.valid_symbols]
 
 # Export all symbols:
-symbols = [_pad, _eos] + list(_characters) + _arpabet
+symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet
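For reference, a sketch of what the reorganized `symbols` list contains (the ARPAbet portion is truncated here; the real list comes from `cmudict.valid_symbols`). The explicit `~` EOS symbol is gone, matching the removal of the EOS append in text/__init__.py above:

```python
# Sketch of the new symbol set composition (ARPAbet truncated for illustration).
_pad = '_'
_punctuation = "!'(),.:;? "
_special = '-'
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_arpabet = ['@AA', '@AE', '@AH']   # truncated; real entries come from cmudict

symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet
print(symbols[:5])   # ['_', '-', '!', "'", '(']
```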

train.py
@@ -5,9 +5,9 @@ import math
 from numpy import finfo
 
 import torch
-from distributed import DistributedDataParallel
+from distributed import apply_gradient_allreduce
+import torch.distributed as dist
 from torch.utils.data.distributed import DistributedSampler
-from torch.nn import DataParallel
 from torch.utils.data import DataLoader
 
 from fp16_optimizer import FP16_Optimizer
@@ -30,19 +30,20 @@ def batchnorm_to_float(module):
 
 def reduce_tensor(tensor, num_gpus):
     rt = tensor.clone()
-    torch.distributed.all_reduce(rt, op=torch.distributed.reduce_op.SUM)
+    dist.all_reduce(rt, op=dist.reduce_op.SUM)
     rt /= num_gpus
     return rt
 
 
 def init_distributed(hparams, n_gpus, rank, group_name):
     assert torch.cuda.is_available(), "Distributed mode requires CUDA."
-    print("Initializing distributed")
+    print("Initializing Distributed")
 
     # Set cuda device so everything is done on the right GPU.
     torch.cuda.set_device(rank % torch.cuda.device_count())
 
     # Initialize distributed communication
-    torch.distributed.init_process_group(
+    dist.init_process_group(
         backend=hparams.dist_backend, init_method=hparams.dist_url,
         world_size=n_gpus, rank=rank, group_name=group_name)
 
@@ -131,22 +132,20 @@ def validate(model, criterion, valset, iteration, batch_size, n_gpus,
                                 pin_memory=False, collate_fn=collate_fn)
 
         val_loss = 0.0
-        if distributed_run or torch.cuda.device_count() > 1:
-            batch_parser = model.module.parse_batch
-        else:
-            batch_parser = model.parse_batch
         for i, batch in enumerate(val_loader):
-            x, y = batch_parser(batch)
+            x, y = model.parse_batch(batch)
             y_pred = model(x)
             loss = criterion(y_pred, y)
-            reduced_val_loss = reduce_tensor(loss.data, n_gpus)[0] \
-                if distributed_run else loss.data[0]
+            if distributed_run:
+                reduced_val_loss = reduce_tensor(loss.data, num_gpus).item()
+            else:
+                reduced_val_loss = loss.item()
             val_loss += reduced_val_loss
         val_loss = val_loss / (i + 1)
 
     model.train()
-    return val_loss
+    print("Validation loss {}: {:9f}  ".format(iteration, reduced_val_loss))
+    logger.log_validation(reduced_val_loss, model, y, y_pred, iteration)
 
 
 def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
@@ -176,6 +175,9 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
         optimizer = FP16_Optimizer(
             optimizer, dynamic_loss_scale=hparams.dynamic_loss_scaling)
 
+    if hparams.distributed_run:
+        model = apply_gradient_allreduce(model)
+
     criterion = Tacotron2Loss()
 
     logger = prepare_directories_and_logger(
@@ -194,15 +196,10 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
                 checkpoint_path, model, optimizer)
             if hparams.use_saved_learning_rate:
                 learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))
 
     model.train()
-    if hparams.distributed_run or torch.cuda.device_count() > 1:
-        batch_parser = model.module.parse_batch
-    else:
-        batch_parser = model.parse_batch
     # ================ MAIN TRAINNIG LOOP! ===================
     for epoch in range(epoch_offset, hparams.epochs):
         print("Epoch: {}".format(epoch))
@@ -212,18 +209,21 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
                 param_group['lr'] = learning_rate
 
             model.zero_grad()
-            x, y = batch_parser(batch)
+            x, y = model.parse_batch(batch)
             y_pred = model(x)
 
             loss = criterion(y_pred, y)
-            reduced_loss = reduce_tensor(loss.data, n_gpus)[0] \
-                if hparams.distributed_run else loss.data[0]
+            if hparams.distributed_run:
+                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
+            else:
+                reduced_loss = loss.item()
 
             if hparams.fp16_run:
                 optimizer.backward(loss)
                 grad_norm = optimizer.clip_fp32_grads(hparams.grad_clip_thresh)
             else:
                 loss.backward()
-                grad_norm = torch.nn.utils.clip_grad_norm(
+                grad_norm = torch.nn.utils.clip_grad_norm_(
                     model.parameters(), hparams.grad_clip_thresh)
 
             optimizer.step()
@@ -234,20 +234,14 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
             duration = time.perf_counter() - start
             print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                 iteration, reduced_loss, grad_norm, duration))
 
             logger.log_training(
                 reduced_loss, grad_norm, learning_rate, duration, iteration)
 
             if not overflow and (iteration % hparams.iters_per_checkpoint == 0):
-                reduced_val_loss = validate(
-                    model, criterion, valset, iteration, hparams.batch_size,
-                    n_gpus, collate_fn, logger, hparams.distributed_run, rank)
+                validate(model, criterion, valset, iteration, hparams.batch_size,
+                         n_gpus, collate_fn, logger, hparams.distributed_run, rank)
                 if rank == 0:
-                    print("Validation loss {}: {:9f}  ".format(
-                        iteration, reduced_val_loss))
-                    logger.log_validation(
-                        reduced_val_loss, model, y, y_pred, iteration)
                     checkpoint_path = os.path.join(
                         output_directory, "checkpoint_{}".format(iteration))
                     save_checkpoint(model, optimizer, learning_rate, iteration,

utils.py
@@ -4,29 +4,26 @@ import torch
 
 
 def get_mask_from_lengths(lengths):
-    max_len = torch.max(lengths)
-    ids = torch.arange(0, max_len).long().cuda()
+    max_len = torch.max(lengths).item()
+    ids = torch.arange(0, max_len, out=torch.cuda.LongTensor(max_len))
     mask = (ids < lengths.unsqueeze(1)).byte()
     return mask
 
 
-def load_wav_to_torch(full_path, sr):
+def load_wav_to_torch(full_path):
     sampling_rate, data = read(full_path)
-    assert sr == sampling_rate, "{} SR doesn't match {} on path {}".format(
-        sr, sampling_rate, full_path)
-    return torch.FloatTensor(data.astype(np.float32))
+    return torch.FloatTensor(data.astype(np.float32)), sampling_rate
 
 
-def load_filepaths_and_text(filename, sort_by_length, split="|"):
+def load_filepaths_and_text(filename, split="|"):
     with open(filename, encoding='utf-8') as f:
         filepaths_and_text = [line.strip().split(split) for line in f]
 
-    if sort_by_length:
-        filepaths_and_text.sort(key=lambda x: len(x[1]))
-
     return filepaths_and_text
 
 
 def to_gpu(x):
-    x = x.contiguous().cuda(async=True)
+    x = x.contiguous()
+
+    if torch.cuda.is_available():
+        x = x.cuda(non_blocking=True)
     return torch.autograd.Variable(x)
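A CPU illustration of what the updated `get_mask_from_lengths` produces (the repo version allocates `ids` directly as a `torch.cuda.LongTensor`):

```python
# Illustration only: same logic as get_mask_from_lengths, but on the CPU.
import torch

lengths = torch.LongTensor([2, 5])
max_len = torch.max(lengths).item()
ids = torch.arange(0, max_len)
mask = (ids < lengths.unsqueeze(1)).byte()
print(mask)
# tensor([[1, 1, 0, 0, 0],
#         [1, 1, 1, 1, 1]], dtype=torch.uint8)
```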