mirror of
https://github.com/malarinv/tacotron2
synced 2026-03-08 09:42:34 +00:00
Compare commits
1 Commits
attention_
...
single-gpu
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
723e869d4b |
@@ -1,4 +1,3 @@
|
|||||||
FROM pytorch/pytorch:0.4_cuda9_cudnn7
|
FROM pytorch/pytorch:0.4_cuda9_cudnn7
|
||||||
|
|
||||||
RUN pip install numpy scipy matplotlib librosa==0.6.0 tensorflow tensorboardX
|
RUN pip install numpy scipy matplotlib librosa==0.6.0 tensorflow tensorboardX inflect==0.2.5 Unidecode==1.0.22
|
||||||
inflect==0.2.5 Unidecode==1.0.22
|
|
||||||
|
|||||||
@@ -20,12 +20,11 @@ Distributed and FP16 support relies on work by Christian Sarofeen and NVIDIA's
|
|||||||
2. Clone this repo: `git clone https://github.com/NVIDIA/tacotron2.git`
|
2. Clone this repo: `git clone https://github.com/NVIDIA/tacotron2.git`
|
||||||
3. CD into this repo: `cd tacotron2`
|
3. CD into this repo: `cd tacotron2`
|
||||||
4. Update .wav paths: `sed -i -- 's,DUMMY,ljs_dataset_folder/wavs,g' filelists/*.txt`
|
4. Update .wav paths: `sed -i -- 's,DUMMY,ljs_dataset_folder/wavs,g' filelists/*.txt`
|
||||||
- Alternatively, set `load_mel_from_disk=True` in `hparams.py` and update mel-spectrogram paths
|
|
||||||
5. Install [pytorch 0.4](https://github.com/pytorch/pytorch)
|
5. Install [pytorch 0.4](https://github.com/pytorch/pytorch)
|
||||||
6. Install python requirements or build docker image
|
6. Install python requirements or use docker container (tbd)
|
||||||
- Install python requirements: `pip install -r requirements.txt`
|
- Install python requirements: `pip install requirements.txt`
|
||||||
- **OR**
|
- **OR**
|
||||||
- Build docker image: `docker build --tag tacotron2 .`
|
- Docker container `(tbd)`
|
||||||
|
|
||||||
## Training
|
## Training
|
||||||
1. `python train.py --output_directory=outdir --log_directory=logdir`
|
1. `python train.py --output_directory=outdir --log_directory=logdir`
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
import random
|
import random
|
||||||
import numpy as np
|
|
||||||
import torch
|
import torch
|
||||||
import torch.utils.data
|
import torch.utils.data
|
||||||
|
|
||||||
@@ -20,7 +19,6 @@ class TextMelLoader(torch.utils.data.Dataset):
|
|||||||
self.text_cleaners = hparams.text_cleaners
|
self.text_cleaners = hparams.text_cleaners
|
||||||
self.max_wav_value = hparams.max_wav_value
|
self.max_wav_value = hparams.max_wav_value
|
||||||
self.sampling_rate = hparams.sampling_rate
|
self.sampling_rate = hparams.sampling_rate
|
||||||
self.load_mel_from_disk = hparams.load_mel_from_disk
|
|
||||||
self.stft = layers.TacotronSTFT(
|
self.stft = layers.TacotronSTFT(
|
||||||
hparams.filter_length, hparams.hop_length, hparams.win_length,
|
hparams.filter_length, hparams.hop_length, hparams.win_length,
|
||||||
hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
|
hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
|
||||||
@@ -37,19 +35,12 @@ class TextMelLoader(torch.utils.data.Dataset):
|
|||||||
return (text, mel)
|
return (text, mel)
|
||||||
|
|
||||||
def get_mel(self, filename):
|
def get_mel(self, filename):
|
||||||
if not self.load_mel_from_disk:
|
audio = load_wav_to_torch(filename, self.sampling_rate)
|
||||||
audio = load_wav_to_torch(filename, self.sampling_rate)
|
audio_norm = audio / self.max_wav_value
|
||||||
audio_norm = audio / self.max_wav_value
|
audio_norm = audio_norm.unsqueeze(0)
|
||||||
audio_norm = audio_norm.unsqueeze(0)
|
audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
|
||||||
audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
|
melspec = self.stft.mel_spectrogram(audio_norm)
|
||||||
melspec = self.stft.mel_spectrogram(audio_norm)
|
melspec = torch.squeeze(melspec, 0)
|
||||||
melspec = torch.squeeze(melspec, 0)
|
|
||||||
else:
|
|
||||||
melspec = torch.from_numpy(np.load(filename))
|
|
||||||
assert melspec.size(0) == self.stft.n_mel_channels, (
|
|
||||||
'Mel dimension mismatch: given {}, expected {}'.format(
|
|
||||||
melspec.size(0), self.stft.n_mel_channels))
|
|
||||||
|
|
||||||
return melspec
|
return melspec
|
||||||
|
|
||||||
def get_text(self, text):
|
def get_text(self, text):
|
||||||
|
|||||||
@@ -23,7 +23,6 @@ def create_hparams(hparams_string=None, verbose=False):
|
|||||||
################################
|
################################
|
||||||
# Data Parameters #
|
# Data Parameters #
|
||||||
################################
|
################################
|
||||||
load_mel_from_disk=False,
|
|
||||||
training_files='filelists/ljs_audio_text_train_filelist.txt',
|
training_files='filelists/ljs_audio_text_train_filelist.txt',
|
||||||
validation_files='filelists/ljs_audio_text_val_filelist.txt',
|
validation_files='filelists/ljs_audio_text_val_filelist.txt',
|
||||||
text_cleaners=['english_cleaners'],
|
text_cleaners=['english_cleaners'],
|
||||||
|
|||||||
@@ -98,11 +98,8 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"checkpoint_path = \"/home/scratch.adlr-gcf/audio_denoising/runs/TTS-Tacotron2-LJS-MSE-DRC-NoMaskPadding-Unsorted-Distributed-22khz/checkpoint_15500\"\n",
|
"checkpoint_path = \"/home/scratch.adlr-gcf/audio_denoising/runs/TTS-Tacotron2-LJS-MSE-DRC-NoMaskPadding-Unsorted-Distributed-22khz/checkpoint_15500\"\n",
|
||||||
"model = load_model(hparams)\n",
|
"model = load_model(hparams)\n",
|
||||||
"try:\n",
|
"model.load_state_dict(torch.load(checkpoint_path)['state_dict'])\n",
|
||||||
" model = model.module\n",
|
"model = model.module\n",
|
||||||
"except:\n",
|
|
||||||
" pass\n",
|
|
||||||
"model.load_state_dict({k.replace('module.',''):v for k,v in torch.load(checkpoint_path)['state_dict'].items()})\n",
|
|
||||||
"_ = model.eval()"
|
"_ = model.eval()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -51,10 +51,11 @@ class DynamicLossScaler:
|
|||||||
|
|
||||||
# `x` is a torch.Tensor
|
# `x` is a torch.Tensor
|
||||||
def _has_inf_or_nan(x):
|
def _has_inf_or_nan(x):
|
||||||
cpu_sum = float(x.float().sum())
|
inf_count = torch.sum(x.abs() == float('inf'))
|
||||||
if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
|
if inf_count > 0:
|
||||||
return True
|
return True
|
||||||
return False
|
nan_count = torch.sum(x != x)
|
||||||
|
return nan_count > 0
|
||||||
|
|
||||||
# `overflow` is boolean indicating whether we overflowed in gradient
|
# `overflow` is boolean indicating whether we overflowed in gradient
|
||||||
def update_scale(self, overflow):
|
def update_scale(self, overflow):
|
||||||
|
|||||||
23
model.py
23
model.py
@@ -221,7 +221,7 @@ class Decoder(nn.Module):
|
|||||||
[hparams.prenet_dim, hparams.prenet_dim])
|
[hparams.prenet_dim, hparams.prenet_dim])
|
||||||
|
|
||||||
self.attention_rnn = nn.LSTMCell(
|
self.attention_rnn = nn.LSTMCell(
|
||||||
hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
|
hparams.prenet_dim + hparams.encoder_embedding_dim,
|
||||||
hparams.attention_rnn_dim)
|
hparams.attention_rnn_dim)
|
||||||
|
|
||||||
self.attention_layer = Attention(
|
self.attention_layer = Attention(
|
||||||
@@ -230,7 +230,7 @@ class Decoder(nn.Module):
|
|||||||
hparams.attention_location_kernel_size)
|
hparams.attention_location_kernel_size)
|
||||||
|
|
||||||
self.decoder_rnn = nn.LSTMCell(
|
self.decoder_rnn = nn.LSTMCell(
|
||||||
hparams.prenet_dim + hparams.encoder_embedding_dim,
|
hparams.attention_rnn_dim + hparams.encoder_embedding_dim,
|
||||||
hparams.decoder_rnn_dim, 1)
|
hparams.decoder_rnn_dim, 1)
|
||||||
|
|
||||||
self.linear_projection = LinearNorm(
|
self.linear_projection = LinearNorm(
|
||||||
@@ -351,7 +351,8 @@ class Decoder(nn.Module):
|
|||||||
attention_weights:
|
attention_weights:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
cell_input = torch.cat((self.decoder_hidden, self.attention_context), -1)
|
decoder_input = self.prenet(decoder_input)
|
||||||
|
cell_input = torch.cat((decoder_input, self.attention_context), -1)
|
||||||
self.attention_hidden, self.attention_cell = self.attention_rnn(
|
self.attention_hidden, self.attention_cell = self.attention_rnn(
|
||||||
cell_input, (self.attention_hidden, self.attention_cell))
|
cell_input, (self.attention_hidden, self.attention_cell))
|
||||||
|
|
||||||
@@ -363,8 +364,8 @@ class Decoder(nn.Module):
|
|||||||
attention_weights_cat, self.mask)
|
attention_weights_cat, self.mask)
|
||||||
|
|
||||||
self.attention_weights_cum += self.attention_weights
|
self.attention_weights_cum += self.attention_weights
|
||||||
prenet_output = self.prenet(decoder_input)
|
decoder_input = torch.cat(
|
||||||
decoder_input = torch.cat((prenet_output, self.attention_context), -1)
|
(self.attention_hidden, self.attention_context), -1)
|
||||||
self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
|
self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
|
||||||
decoder_input, (self.decoder_hidden, self.decoder_cell))
|
decoder_input, (self.decoder_hidden, self.decoder_cell))
|
||||||
|
|
||||||
@@ -401,8 +402,9 @@ class Decoder(nn.Module):
|
|||||||
while len(mel_outputs) < decoder_inputs.size(0):
|
while len(mel_outputs) < decoder_inputs.size(0):
|
||||||
mel_output, gate_output, attention_weights = self.decode(
|
mel_output, gate_output, attention_weights = self.decode(
|
||||||
decoder_input)
|
decoder_input)
|
||||||
mel_outputs += [mel_output]
|
|
||||||
gate_outputs += [gate_output.squeeze(1)]
|
mel_outputs += [mel_output.squeeze(1)]
|
||||||
|
gate_outputs += [gate_output.squeeze()]
|
||||||
alignments += [attention_weights]
|
alignments += [attention_weights]
|
||||||
|
|
||||||
decoder_input = decoder_inputs[len(mel_outputs) - 1]
|
decoder_input = decoder_inputs[len(mel_outputs) - 1]
|
||||||
@@ -429,11 +431,12 @@ class Decoder(nn.Module):
|
|||||||
self.initialize_decoder_states(memory, mask=None)
|
self.initialize_decoder_states(memory, mask=None)
|
||||||
|
|
||||||
mel_outputs, gate_outputs, alignments = [], [], []
|
mel_outputs, gate_outputs, alignments = [], [], []
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
mel_output, gate_output, alignment = self.decode(decoder_input)
|
mel_output, gate_output, alignment = self.decode(decoder_input)
|
||||||
|
|
||||||
mel_outputs += [mel_output]
|
mel_outputs += [mel_output.squeeze(1)]
|
||||||
gate_outputs += [gate_output.squeeze(1)]
|
gate_outputs += [gate_output.squeeze()]
|
||||||
alignments += [alignment]
|
alignments += [alignment]
|
||||||
|
|
||||||
if F.sigmoid(gate_output.data) > self.gate_threshold:
|
if F.sigmoid(gate_output.data) > self.gate_threshold:
|
||||||
@@ -467,8 +470,8 @@ class Tacotron2(nn.Module):
|
|||||||
text_padded, input_lengths, mel_padded, gate_padded, \
|
text_padded, input_lengths, mel_padded, gate_padded, \
|
||||||
output_lengths = batch
|
output_lengths = batch
|
||||||
text_padded = to_gpu(text_padded).long()
|
text_padded = to_gpu(text_padded).long()
|
||||||
max_len = int(torch.max(input_lengths.data).numpy())
|
|
||||||
input_lengths = to_gpu(input_lengths).long()
|
input_lengths = to_gpu(input_lengths).long()
|
||||||
|
max_len = torch.max(input_lengths.data)
|
||||||
mel_padded = to_gpu(mel_padded).float()
|
mel_padded = to_gpu(mel_padded).float()
|
||||||
gate_padded = to_gpu(gate_padded).float()
|
gate_padded = to_gpu(gate_padded).float()
|
||||||
output_lengths = to_gpu(output_lengths).long()
|
output_lengths = to_gpu(output_lengths).long()
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
torch==0.2.0.post3
|
torch==0.2.0.post3
|
||||||
matplotlib==2.1.0
|
matplotlib==2.1.0
|
||||||
tensorflow
|
tensorflow==1.5.0
|
||||||
numpy==1.13.3
|
numpy==1.13.3
|
||||||
inflect==0.2.5
|
inflect==0.2.5
|
||||||
librosa==0.6.0
|
librosa==0.6.0
|
||||||
|
|||||||
7
train.py
7
train.py
@@ -2,7 +2,6 @@ import os
|
|||||||
import time
|
import time
|
||||||
import argparse
|
import argparse
|
||||||
import math
|
import math
|
||||||
from numpy import finfo
|
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from distributed import DistributedDataParallel
|
from distributed import DistributedDataParallel
|
||||||
@@ -78,9 +77,7 @@ def prepare_directories_and_logger(output_directory, log_directory, rank):
|
|||||||
|
|
||||||
def load_model(hparams):
|
def load_model(hparams):
|
||||||
model = Tacotron2(hparams).cuda()
|
model = Tacotron2(hparams).cuda()
|
||||||
if hparams.fp16_run:
|
model = batchnorm_to_float(model.half()) if hparams.fp16_run else model
|
||||||
model = batchnorm_to_float(model.half())
|
|
||||||
model.decoder.attention_layer.score_mask_value = float(finfo('float16').min)
|
|
||||||
|
|
||||||
if hparams.distributed_run:
|
if hparams.distributed_run:
|
||||||
model = DistributedDataParallel(model)
|
model = DistributedDataParallel(model)
|
||||||
@@ -279,7 +276,7 @@ if __name__ == '__main__':
|
|||||||
torch.backends.cudnn.benchmark = hparams.cudnn_benchmark
|
torch.backends.cudnn.benchmark = hparams.cudnn_benchmark
|
||||||
|
|
||||||
print("FP16 Run:", hparams.fp16_run)
|
print("FP16 Run:", hparams.fp16_run)
|
||||||
print("Dynamic Loss Scaling:", hparams.dynamic_loss_scaling)
|
print("Dynamic Loss Scaling", hparams.dynamic_loss_scaling)
|
||||||
print("Distributed Run:", hparams.distributed_run)
|
print("Distributed Run:", hparams.distributed_run)
|
||||||
print("cuDNN Enabled:", hparams.cudnn_enabled)
|
print("cuDNN Enabled:", hparams.cudnn_enabled)
|
||||||
print("cuDNN Benchmark:", hparams.cudnn_benchmark)
|
print("cuDNN Benchmark:", hparams.cudnn_benchmark)
|
||||||
|
|||||||
2
utils.py
2
utils.py
@@ -5,7 +5,7 @@ import torch
|
|||||||
|
|
||||||
def get_mask_from_lengths(lengths):
|
def get_mask_from_lengths(lengths):
|
||||||
max_len = torch.max(lengths)
|
max_len = torch.max(lengths)
|
||||||
ids = torch.arange(0, max_len).long().cuda()
|
ids = torch.arange(0, max_len, out=torch.LongTensor(max_len)).cuda()
|
||||||
mask = (ids < lengths.unsqueeze(1)).byte()
|
mask = (ids < lengths.unsqueeze(1)).byte()
|
||||||
return mask
|
return mask
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user