mirror of
https://github.com/malarinv/tacotron2
synced 2026-03-08 09:42:34 +00:00
Compare commits
15 Commits
mask-utils
...
load_mel_f
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1071023017 | ||
|
|
cd851585cb | ||
|
|
2da7a2ebab | ||
|
|
62d2c8b957 | ||
|
|
2d41ea0682 | ||
|
|
20568643cb | ||
|
|
dcd925f6c8 | ||
|
|
4ac6ce9ab5 | ||
|
|
c67ca6531e | ||
|
|
78d5150d83 | ||
|
|
424b2f5bf0 | ||
|
|
a38429e629 | ||
|
|
b20765a3dc | ||
|
|
2a394f4aaa | ||
|
|
2c545ac800 |
@@ -20,9 +20,10 @@ Distributed and FP16 support relies on work by Christian Sarofeen and NVIDIA's
|
||||
2. Clone this repo: `git clone https://github.com/NVIDIA/tacotron2.git`
|
||||
3. CD into this repo: `cd tacotron2`
|
||||
4. Update .wav paths: `sed -i -- 's,DUMMY,ljs_dataset_folder/wavs,g' filelists/*.txt`
|
||||
- Alternatively, set `load_mel_from_disk=True` in `hparams.py` and update mel-spectrogram paths
|
||||
5. Install [pytorch 0.4](https://github.com/pytorch/pytorch)
|
||||
6. Install python requirements or build docker image
|
||||
- Install python requirements: `pip install requirements.txt`
|
||||
- Install python requirements: `pip install -r requirements.txt`
|
||||
- **OR**
|
||||
- Build docker image: `docker build --tag tacotron2 .`
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import random
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.utils.data
|
||||
|
||||
@@ -19,6 +20,7 @@ class TextMelLoader(torch.utils.data.Dataset):
|
||||
self.text_cleaners = hparams.text_cleaners
|
||||
self.max_wav_value = hparams.max_wav_value
|
||||
self.sampling_rate = hparams.sampling_rate
|
||||
self.load_mel_from_disk = hparams.load_mel_from_disk
|
||||
self.stft = layers.TacotronSTFT(
|
||||
hparams.filter_length, hparams.hop_length, hparams.win_length,
|
||||
hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
|
||||
@@ -35,12 +37,19 @@ class TextMelLoader(torch.utils.data.Dataset):
|
||||
return (text, mel)
|
||||
|
||||
def get_mel(self, filename):
|
||||
audio = load_wav_to_torch(filename, self.sampling_rate)
|
||||
audio_norm = audio / self.max_wav_value
|
||||
audio_norm = audio_norm.unsqueeze(0)
|
||||
audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
|
||||
melspec = self.stft.mel_spectrogram(audio_norm)
|
||||
melspec = torch.squeeze(melspec, 0)
|
||||
if not self.load_mel_from_disk:
|
||||
audio = load_wav_to_torch(filename, self.sampling_rate)
|
||||
audio_norm = audio / self.max_wav_value
|
||||
audio_norm = audio_norm.unsqueeze(0)
|
||||
audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
|
||||
melspec = self.stft.mel_spectrogram(audio_norm)
|
||||
melspec = torch.squeeze(melspec, 0)
|
||||
else:
|
||||
melspec = torch.from_numpy(np.load(filename))
|
||||
assert melspec.size(0) == self.stft.n_mel_channels, (
|
||||
'Mel dimension mismatch: given {}, expected {}'.format(
|
||||
melspec.size(0), self.stft.n_mel_channels))
|
||||
|
||||
return melspec
|
||||
|
||||
def get_text(self, text):
|
||||
|
||||
@@ -23,6 +23,7 @@ def create_hparams(hparams_string=None, verbose=False):
|
||||
################################
|
||||
# Data Parameters #
|
||||
################################
|
||||
load_mel_from_disk=False,
|
||||
training_files='filelists/ljs_audio_text_train_filelist.txt',
|
||||
validation_files='filelists/ljs_audio_text_val_filelist.txt',
|
||||
text_cleaners=['english_cleaners'],
|
||||
|
||||
@@ -98,8 +98,11 @@
|
||||
"source": [
|
||||
"checkpoint_path = \"/home/scratch.adlr-gcf/audio_denoising/runs/TTS-Tacotron2-LJS-MSE-DRC-NoMaskPadding-Unsorted-Distributed-22khz/checkpoint_15500\"\n",
|
||||
"model = load_model(hparams)\n",
|
||||
"model.load_state_dict(torch.load(checkpoint_path)['state_dict'])\n",
|
||||
"model = model.module\n",
|
||||
"try:\n",
|
||||
" model = model.module\n",
|
||||
"except:\n",
|
||||
" pass\n",
|
||||
"model.load_state_dict({k.replace('module.',''):v for k,v in torch.load(checkpoint_path)['state_dict'].items()})\n",
|
||||
"_ = model.eval()"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -51,11 +51,10 @@ class DynamicLossScaler:
|
||||
|
||||
# `x` is a torch.Tensor
|
||||
def _has_inf_or_nan(x):
|
||||
inf_count = torch.sum(x.abs() == float('inf'))
|
||||
if inf_count > 0:
|
||||
cpu_sum = float(x.float().sum())
|
||||
if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
|
||||
return True
|
||||
nan_count = torch.sum(x != x)
|
||||
return nan_count > 0
|
||||
return False
|
||||
|
||||
# `overflow` is boolean indicating whether we overflowed in gradient
|
||||
def update_scale(self, overflow):
|
||||
|
||||
12
model.py
12
model.py
@@ -402,9 +402,8 @@ class Decoder(nn.Module):
|
||||
while len(mel_outputs) < decoder_inputs.size(0):
|
||||
mel_output, gate_output, attention_weights = self.decode(
|
||||
decoder_input)
|
||||
|
||||
mel_outputs += [mel_output.squeeze(1)]
|
||||
gate_outputs += [gate_output.squeeze()]
|
||||
mel_outputs += [mel_output]
|
||||
gate_outputs += [gate_output.squeeze(1)]
|
||||
alignments += [attention_weights]
|
||||
|
||||
decoder_input = decoder_inputs[len(mel_outputs) - 1]
|
||||
@@ -431,12 +430,11 @@ class Decoder(nn.Module):
|
||||
self.initialize_decoder_states(memory, mask=None)
|
||||
|
||||
mel_outputs, gate_outputs, alignments = [], [], []
|
||||
|
||||
while True:
|
||||
mel_output, gate_output, alignment = self.decode(decoder_input)
|
||||
|
||||
mel_outputs += [mel_output.squeeze(1)]
|
||||
gate_outputs += [gate_output.squeeze()]
|
||||
mel_outputs += [mel_output]
|
||||
gate_outputs += [gate_output.squeeze(1)]
|
||||
alignments += [alignment]
|
||||
|
||||
if F.sigmoid(gate_output.data) > self.gate_threshold:
|
||||
@@ -470,8 +468,8 @@ class Tacotron2(nn.Module):
|
||||
text_padded, input_lengths, mel_padded, gate_padded, \
|
||||
output_lengths = batch
|
||||
text_padded = to_gpu(text_padded).long()
|
||||
max_len = int(torch.max(input_lengths.data).numpy())
|
||||
input_lengths = to_gpu(input_lengths).long()
|
||||
max_len = torch.max(input_lengths.data)
|
||||
mel_padded = to_gpu(mel_padded).float()
|
||||
gate_padded = to_gpu(gate_padded).float()
|
||||
output_lengths = to_gpu(output_lengths).long()
|
||||
|
||||
5
train.py
5
train.py
@@ -2,6 +2,7 @@ import os
|
||||
import time
|
||||
import argparse
|
||||
import math
|
||||
from numpy import finfo
|
||||
|
||||
import torch
|
||||
from distributed import DistributedDataParallel
|
||||
@@ -77,7 +78,9 @@ def prepare_directories_and_logger(output_directory, log_directory, rank):
|
||||
|
||||
def load_model(hparams):
|
||||
model = Tacotron2(hparams).cuda()
|
||||
model = batchnorm_to_float(model.half()) if hparams.fp16_run else model
|
||||
if hparams.fp16_run:
|
||||
model = batchnorm_to_float(model.half())
|
||||
model.decoder.attention_layer.score_mask_value = float(finfo('float16').min)
|
||||
|
||||
if hparams.distributed_run:
|
||||
model = DistributedDataParallel(model)
|
||||
|
||||
Reference in New Issue
Block a user