1
0
mirror of https://github.com/malarinv/tacotron2 synced 2026-03-08 09:42:34 +00:00

75 Commits

Author SHA1 Message Date
6d3788d858 1. using SentencePiece, Pretrained BPEmb
2. Using 40 Mel channels with 4000Hz upperbound
2019-09-24 10:29:34 +05:30
f449105b79 1. updated requirements
2. spm params explicit
3. gitignore and script
2019-09-23 16:26:54 +05:30
4d5001bdf0 1. using sentencepiece for text_to_seq
2. using 40 mel channels
3. added makefile and .gitignore
2019-09-23 15:30:43 +05:30
Rafael Valle
131c1465b4 Merge pull request #188 from jybaek/fixed-waveglow-link
Fixed link to download waveglow from inference.py
2019-04-22 16:49:14 -07:00
jybaek
d5321ff0ca Fixed link to download waveglow from inference.py 2019-04-19 15:21:09 +09:00
rafaelvalle
c76ac3b211 README.md: clarifying terminology 2019-04-03 14:59:20 -07:00
rafaelvalle
e3d2d0a5ef README.md: using proper nomenclature 2019-04-03 14:56:06 -07:00
rafaelvalle
a992aea070 README.md: updating terminology 2019-04-03 14:54:45 -07:00
rafaelvalle
eb2a171690 Merge branch 'master' of https://github.com/NVIDIA/tacotron2 2019-04-03 13:51:59 -08:00
rafaelvalle
821bfeba5d README.md: adding instructions to install apex 2019-04-03 13:51:36 -08:00
rafaelvalle
d6670c8ed7 Dockerfile: updating to use latest pytorch and apex 2019-04-03 13:51:22 -08:00
rafaelvalle
0274619e45 train.py: using amp for mixed precision training 2019-04-03 13:42:00 -08:00
rafaelvalle
bb20035586 inference.ipynb: adding fp16 inference 2019-04-03 13:41:11 -08:00
rafaelvalle
1480f82908 model.py: renaming variables, removing dropout from lstm cell state, removing conversions now handled by amp 2019-04-03 13:36:35 -08:00
rafaelvalle
087c86755f logger.py: using new pytorch api 2019-04-03 13:35:04 -08:00
Rafael Valle
ece7d3f568 train.py: changing dataloder params given sampler 2019-03-19 13:47:01 -07:00
rafaelvalle
f37998c59d train.py: shuffling at every epoch 2019-03-15 17:49:27 -07:00
rafaelvalle
bff304f432 README.md: adding explanation on training from pre-trained model 2019-03-15 17:38:40 -07:00
rafaelvalle
3869781877 train.py: adding routine to warm start and ignore layers, e.g. embedding.weight 2019-03-15 17:34:27 -07:00
rafaelvalle
bb67613493 hparams.py: adding ignore_layers argument to ignore text embedding layers when warm_starting 2019-03-15 17:28:50 -07:00
rafaelvalle
af1f71a975 inference.ipynb: adding code to remove waveglows bias 2019-03-15 16:54:54 -07:00
rafaelvalle
fc0d34cfce stft.py: moving window_sum to cuda if magnitude is cuda 2019-03-15 14:36:56 -07:00
Rafael Valle
f2c94d94fd Merge pull request #136 from GrzegorzKarchNV/master
Fixing concatenation error for fp16 distributed training
2019-02-01 12:10:42 -08:00
gkarch
df4a466af2 Fixing concatenation error for fp16 ditributed training 2019-02-01 09:55:59 +01:00
rafaelvalle
825ffa47d1 inference.ipynb: reverting fp16 inference for now 2018-12-08 21:26:01 -08:00
rafaelvalle
4d7b04120a inference.ipynb: changing waverglow inference fo fp16 2018-12-05 22:14:35 -08:00
rafaelvalle
6e430556bd train.py: val logger on gpu 0 only 2018-11-27 22:03:11 -08:00
rafaelvalle
3973b3e495 hparams.py: distributed using tcp 2018-11-27 22:02:43 -08:00
rafaelvalle
52a30bb7b6 distributed.py: replacing to avoid distributed error 2018-11-27 21:01:26 -08:00
rafaelvalle
0ad65cc053 train.py: renaming variable to n_gpus 2018-11-27 21:00:05 -08:00
rafaelvalle
8300844fa7 hparams.py: removing 22khz 2018-11-27 20:56:52 -08:00
rafaelvalle
f06063f746 train.py: renaming function, removing dataparallel 2018-11-27 18:04:12 -08:00
rafaelvalle
3045ba125b inference.ipynb: cleanup 2018-11-27 12:04:36 -08:00
rafaelvalle
4c4aca3662 README.md: layout 2018-11-27 11:59:05 -08:00
rafaelvalle
05dd8f91d2 README.md: adding submodule init to README 2018-11-27 11:55:40 -08:00
rafaelvalle
5d66c3deab adding waveglow submodule 2018-11-27 11:53:20 -08:00
Rafael Valle
f02704f338 Merge pull request #96 from NVIDIA/clean_slate
Clean slate
2018-11-27 08:06:00 -08:00
rafaelvalle
ba8cf36198 requirements.txt: removing pytorch 0.4 from requirements. upgrading to 1.0 2018-11-27 08:04:21 -08:00
rafaelvalle
b5e0a93946 inference.ipynb: updating inference file with relative paths 2018-11-27 08:04:04 -08:00
rafaelvalle
58b0ec61bd README.md: updating requirements and inference demo 2018-11-27 08:03:34 -08:00
rafaelvalle
1ad939df1a inference.ipynb: setting relative model paths 2018-11-27 07:45:09 -08:00
rafaelvalle
32b9a135d0 utils.py: updating 2018-11-25 22:34:38 -08:00
rafaelvalle
ce29e13959 train.py: updating 2018-11-25 22:34:34 -08:00
rafaelvalle
1ea6ed5861 text/symbols.py: updating symbols 2018-11-25 22:34:26 -08:00
rafaelvalle
cdfde985e5 text/__init__.py: remove stop token 2018-11-25 22:34:11 -08:00
rafaelvalle
e314bb4cd0 stft.py: fix filter winlength error 2018-11-25 22:33:52 -08:00
rafaelvalle
4af4ccb135 model.py: rewrite 2018-11-25 22:33:38 -08:00
rafaelvalle
1ec0e5e8cd layers.py: rewrite 2018-11-25 22:33:32 -08:00
rafaelvalle
249afd8043 inference.ipynb: import taco2model to be public 2018-11-25 22:33:16 -08:00
rafaelvalle
1b243d5d5a hparams.py:rewrite 2018-11-25 22:33:05 -08:00
rafaelvalle
d0aa9e7d32 distributed.py: rewrite 2018-11-25 22:32:54 -08:00
rafaelvalle
1683a57ae5 data_utils.py: rewrite 2018-11-25 22:32:47 -08:00
Rafael Valle
fc0cf6a89a Merge pull request #53 from cobr123/patch-1
add pillow
2018-07-02 14:32:21 -07:00
cobr123
8de38495be add pillow 2018-07-02 19:35:51 +03:00
rafaelvalle
7eb045206c README.md: updating readme to explicitly mention that mel representation of WaveNet and Tacotron2 must be the same 2018-06-14 11:25:42 -07:00
rafaelvalle
c67005f1be Dockerfile: adding jupyter to dockerfile 2018-06-14 10:30:01 -07:00
rafaelvalle
cb3794796f Dockerfile: removing return from Dockerfile 2018-06-12 21:38:03 -07:00
Rafael Valle
a8de973923 Merge pull request #37 from yoks/master
`used_saved_learning_rate`  fix
2018-06-12 09:00:14 -07:00
yoks
a0ae2da05f used_saved_learning_rate fix
`used_saved_learning_rate` name change to `use_saved_learning_rate`
2018-06-12 16:53:12 +03:00
rafaelvalle
34066ac4fc requirements.txt: setting torch to 0.4.0 2018-06-11 08:16:31 -07:00
rafaelvalle
12ab5ba89c model.py: setting weight initialization to xavier uniform 2018-06-07 20:28:52 -07:00
rafaelvalle
d10da5f41e hparams.py: commenting n_frames_per_step to indicate that currently only 1 frame per step is supported now 2018-06-07 13:02:23 -07:00
rafaelvalle
5f0ea06c41 hparams.py: adding use saved learning rate param 2018-06-05 08:12:49 -07:00
rafaelvalle
22bcff1155 hparams.py: adding use saved learning rate param 2018-06-05 08:12:35 -07:00
rafaelvalle
2e934478a4 README.md: being explicit about action 2018-06-04 17:12:32 -07:00
rafaelvalle
8ae231b10b README.md: more explicit about demo audio 2018-06-04 16:56:22 -07:00
rafaelvalle
4d733d1bdd README.md: including demo.wav in readme 2018-06-04 16:55:17 -07:00
rafaelvalle
b4e52404ab adding demo.wav file 2018-06-04 16:46:36 -07:00
Rafael Valle
064629c9bc Merge pull request #23 from NVIDIA/attention_full_mel
model.py: attending to full mel instead of prenet and dropout mel
2018-05-20 12:25:54 -07:00
Rafael Valle
d5b64729d1 model.py: moving for better readibility 2018-05-20 12:22:06 -07:00
Rafael Valle
977cb37cea model.py: attending to full mel instead of prenet and dropout mel 2018-05-18 06:59:09 -07:00
Rafael Valle
da30fd8709 Merge pull request #20 from NVIDIA/fp16_path
Fp16 patch, not path!
2018-05-15 09:55:19 -07:00
Rafael Valle
27b1767cb2 train.py: fixing typo 2018-05-15 09:53:33 -07:00
Rafael Valle
817cd403d4 Merge branch 'master' of https://github.com/NVIDIA/tacotron2 into load_mel_from_disk 2018-05-15 09:51:41 -07:00
Rafael Valle
bd42cb6ed7 Merge pull request #19 from NVIDIA/load_mel_from_disk
Load mel from disk
2018-05-15 08:54:24 -07:00
24 changed files with 1614 additions and 642 deletions

147
.gitignore vendored Normal file
View File

@@ -0,0 +1,147 @@
.env
env/
jupyter.json
run*/
filelists/
# Created by https://www.gitignore.io/api/macos
# Edit at https://www.gitignore.io/?templates=macos
### macOS ###
# General
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
# End of https://www.gitignore.io/api/macos
# Created by https://www.gitignore.io/api/python
# Edit at https://www.gitignore.io/?templates=python
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# Mr Developer
.mr.developer.cfg
.project
.pydevproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# End of https://www.gitignore.io/api/python

4
.gitmodules vendored Normal file
View File

@@ -0,0 +1,4 @@
[submodule "waveglow"]
path = waveglow
url = https://github.com/NVIDIA/waveglow
branch = master

View File

@@ -1,4 +1,10 @@
FROM pytorch/pytorch:0.4_cuda9_cudnn7
FROM pytorch/pytorch:nightly-devel-cuda10.0-cudnn7
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH}
RUN pip install numpy scipy matplotlib librosa==0.6.0 tensorflow tensorboardX
inflect==0.2.5 Unidecode==1.0.22
RUN apt-get update -y
RUN pip install numpy scipy matplotlib librosa==0.6.0 tensorflow tensorboardX inflect==0.2.5 Unidecode==1.0.22 jupyter
ADD apex /apex/
WORKDIR /apex/
RUN pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .

6
Makefile Normal file
View File

@@ -0,0 +1,6 @@
.PHONY: clean clean-test clean-pyc clean-build docs help common.mk
.DEFAULT_GOAL := help
notebook:
jupyter lab --ip=0.0.0.0 --no-browser --NotebookApp.token='${JUPYTER_TOKEN}'

55
README.md Normal file → Executable file
View File

@@ -1,13 +1,15 @@
# Tacotron 2 (without wavenet)
Tacotron 2 PyTorch implementation of [Natural TTS Synthesis By Conditioning
PyTorch implementation of [Natural TTS Synthesis By Conditioning
Wavenet On Mel Spectrogram Predictions](https://arxiv.org/pdf/1712.05884.pdf).
This implementation includes **distributed** and **fp16** support
This implementation includes **distributed** and **automatic mixed precision** support
and uses the [LJSpeech dataset](https://keithito.com/LJ-Speech-Dataset/).
Distributed and FP16 support relies on work by Christian Sarofeen and NVIDIA's
[Apex Library](https://github.com/nvidia/apex).
Distributed and Automatic Mixed Precision support relies on NVIDIA's [Apex] and [AMP].
Visit our [website] for audio samples using our published [Tacotron 2] and
[WaveGlow] models.
![Alignment, Predicted Mel Spectrogram, Target Mel Spectrogram](tensorboard.png)
@@ -19,28 +21,44 @@ Distributed and FP16 support relies on work by Christian Sarofeen and NVIDIA's
1. Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/)
2. Clone this repo: `git clone https://github.com/NVIDIA/tacotron2.git`
3. CD into this repo: `cd tacotron2`
4. Update .wav paths: `sed -i -- 's,DUMMY,ljs_dataset_folder/wavs,g' filelists/*.txt`
4. Initialize submodule: `git submodule init; git submodule update`
5. Update .wav paths: `sed -i -- 's,DUMMY,ljs_dataset_folder/wavs,g' filelists/*.txt`
- Alternatively, set `load_mel_from_disk=True` in `hparams.py` and update mel-spectrogram paths
5. Install [pytorch 0.4](https://github.com/pytorch/pytorch)
6. Install python requirements or build docker image
6. Install [PyTorch 1.0]
7. Install [Apex]
8. Install python requirements or build docker image
- Install python requirements: `pip install -r requirements.txt`
- **OR**
- Build docker image: `docker build --tag tacotron2 .`
## Training
1. `python train.py --output_directory=outdir --log_directory=logdir`
2. (OPTIONAL) `tensorboard --logdir=outdir/logdir`
## Multi-GPU (distributed) and FP16 Training
## Training using a pre-trained model
Training using a pre-trained model can lead to faster convergence
By default, the dataset dependent text embedding layers are [ignored]
1. Download our published [Tacotron 2] model
2. `python train.py --output_directory=outdir --log_directory=logdir -c tacotron2_statedict.pt --warm_start`
## Multi-GPU (distributed) and Automatic Mixed Precision Training
1. `python -m multiproc train.py --output_directory=outdir --log_directory=logdir --hparams=distributed_run=True,fp16_run=True`
## Inference
1. `jupyter notebook --ip=127.0.0.1 --port=31337`
2. load inference.ipynb
## Inference demo
1. Download our published [Tacotron 2] model
2. Download our published [WaveGlow] model
3. `jupyter notebook --ip=127.0.0.1 --port=31337`
4. Load inference.ipynb
N.b. When performing Mel-Spectrogram to Audio synthesis, make sure Tacotron 2
and the Mel decoder were trained on the same mel-spectrogram representation.
## Related repos
[nv-wavenet](https://github.com/NVIDIA/nv-wavenet/): Faster than real-time
wavenet inference
[WaveGlow](https://github.com/NVIDIA/WaveGlow) Faster than real time Flow-based
Generative Network for Speech Synthesis
[nv-wavenet](https://github.com/NVIDIA/nv-wavenet/) Faster than real time
WaveNet.
## Acknowledgements
This implementation uses code from the following repos: [Keith
@@ -54,3 +72,10 @@ We are thankful to the Tacotron 2 paper authors, specially Jonathan Shen, Yuxuan
Wang and Zongheng Yang.
[WaveGlow]: https://drive.google.com/file/d/1WsibBTsuRg_SF2Z6L6NFRTT-NjEy1oTx/view?usp=sharing
[Tacotron 2]: https://drive.google.com/file/d/1c5ZTuT7J08wLUoVZ2KkUs_VdZuJ86ZqA/view?usp=sharing
[pytorch 1.0]: https://github.com/pytorch/pytorch#installation
[website]: https://nv-adlr.github.io/WaveGlow
[ignored]: https://github.com/NVIDIA/tacotron2/blob/master/hparams.py#L22
[Apex]: https://github.com/nvidia/apex
[AMP]: https://github.com/NVIDIA/apex/tree/master/apex/amp

27
common.mk Normal file
View File

@@ -0,0 +1,27 @@
define BROWSER_PYSCRIPT
import os, webbrowser, sys
try:
from urllib import pathname2url
except:
from urllib.request import pathname2url
webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1])))
endef
export BROWSER_PYSCRIPT
define PRINT_HELP_PYSCRIPT
import re, sys
for line in sys.stdin:
match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line)
if match:
target, help = match.groups()
print("%-20s %s" % (target, help))
endef
export PRINT_HELP_PYSCRIPT
BROWSER := python -c "$$BROWSER_PYSCRIPT"
help: ## make TARGET forwards the TARGET to sub packages
@cat $(MAKEFILE_LIST) | python -c "$$PRINT_HELP_PYSCRIPT"

View File

@@ -5,8 +5,8 @@ import torch.utils.data
import layers
from utils import load_wav_to_torch, load_filepaths_and_text
from text import text_to_sequence
# from text import text_to_sequence
from text_codec import text_to_sequence
class TextMelLoader(torch.utils.data.Dataset):
"""
@@ -14,9 +14,8 @@ class TextMelLoader(torch.utils.data.Dataset):
2) normalizes text and converts them to sequences of one-hot vectors
3) computes mel-spectrograms from audio files.
"""
def __init__(self, audiopaths_and_text, hparams, shuffle=True):
self.audiopaths_and_text = load_filepaths_and_text(
audiopaths_and_text, hparams.sort_by_length)
def __init__(self, audiopaths_and_text, hparams):
self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
self.text_cleaners = hparams.text_cleaners
self.max_wav_value = hparams.max_wav_value
self.sampling_rate = hparams.sampling_rate
@@ -26,8 +25,7 @@ class TextMelLoader(torch.utils.data.Dataset):
hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
hparams.mel_fmax)
random.seed(1234)
if shuffle:
random.shuffle(self.audiopaths_and_text)
random.shuffle(self.audiopaths_and_text)
def get_mel_text_pair(self, audiopath_and_text):
# separate filename and text
@@ -38,7 +36,10 @@ class TextMelLoader(torch.utils.data.Dataset):
def get_mel(self, filename):
if not self.load_mel_from_disk:
audio = load_wav_to_torch(filename, self.sampling_rate)
audio, sampling_rate = load_wav_to_torch(filename)
if sampling_rate != self.stft.sampling_rate:
raise ValueError("{} {} SR doesn't match target {} SR".format(
sampling_rate, self.stft.sampling_rate))
audio_norm = audio / self.max_wav_value
audio_norm = audio_norm.unsqueeze(0)
audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
@@ -87,9 +88,9 @@ class TextMelCollate():
text = batch[ids_sorted_decreasing[i]][0]
text_padded[i, :text.size(0)] = text
# Right zero-pad mel-spec with extra single zero vector to mark the end
# Right zero-pad mel-spec
num_mels = batch[0][1].size(0)
max_target_len = max([x[1].size(1) for x in batch]) + 1
max_target_len = max([x[1].size(1) for x in batch])
if max_target_len % self.n_frames_per_step != 0:
max_target_len += self.n_frames_per_step - max_target_len % self.n_frames_per_step
assert max_target_len % self.n_frames_per_step == 0
@@ -103,7 +104,7 @@ class TextMelCollate():
for i in range(len(ids_sorted_decreasing)):
mel = batch[ids_sorted_decreasing[i]][1]
mel_padded[i, :, :mel.size(1)] = mel
gate_padded[i, mel.size(1):] = 1
gate_padded[i, mel.size(1)-1:] = 1
output_lengths[i] = mel.size(1)
return text_padded, input_lengths, mel_padded, gate_padded, \

BIN
demo.wav Executable file

Binary file not shown.

View File

@@ -1,6 +1,7 @@
import torch
import torch.distributed as dist
from torch.nn.modules import Module
from torch.autograd import Variable
def _flatten_dense_tensors(tensors):
"""Flatten dense tensors into a contiguous 1D buffer. Assume tensors are of
@@ -118,3 +119,55 @@ class DistributedDataParallel(Module):
super(DistributedDataParallel, self).train(mode)
self.module.train(mode)
'''
'''
Modifies existing model to do gradient allreduce, but doesn't change class
so you don't need "module"
'''
def apply_gradient_allreduce(module):
    """Attach gradient all-reduce hooks to `module` and return the same object.

    The module's class is left untouched, so callers keep using it directly
    instead of going through a wrapper's `.module` attribute.

    What is installed:
      * every tensor in `module.state_dict()` is broadcast from rank 0;
      * a forward hook sets `module.needs_reduction = True` after each forward;
      * a gradient hook on every trainable parameter queues
        `allreduce_params` on the autograd engine.

    `allreduce_params` does its work only while `needs_reduction` is set and
    clears the flag first, so the many per-parameter hooks trigger a single
    reduction per forward pass. Gradients are grouped by dtype, flattened,
    all-reduced, divided by the world size and copied back in place.

    Args:
        module: model whose gradients should be synchronized. Requires an
            initialized `torch.distributed` process group.

    Returns:
        The same `module` instance, with the hooks registered.
    """
    if not hasattr(dist, '_backend'):
        module.warn_on_half = True
    else:
        module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

    # Start every worker from rank 0's parameters and buffers.
    for p in module.state_dict().values():
        if not torch.is_tensor(p):
            continue
        dist.broadcast(p, 0)

    def allreduce_params():
        if(module.needs_reduction):
            module.needs_reduction = False
            buckets = {}
            for param in module.parameters():
                if param.requires_grad and param.grad is not None:
                    tp = param.data.dtype
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
            if module.warn_on_half:
                # Buckets are keyed by dtype, so the half check must use a
                # dtype as well; the legacy tensor-type key never matched.
                if torch.float16 in buckets:
                    print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                          " It is recommended to use the NCCL backend in this case. This currently requires" +
                          " PyTorch built from top of tree master.")
                    module.warn_on_half = False

            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                # One collective per dtype instead of one per parameter.
                coalesced = _flatten_dense_tensors(grads)
                dist.all_reduce(coalesced)
                coalesced /= dist.get_world_size()
                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)

    def allreduce_hook(*unused):
        # NOTE(review): presumably the queued callback runs once the current
        # backward pass finishes -- confirm against the autograd engine docs.
        Variable._execution_engine.queue_callback(allreduce_params)

    for param in list(module.parameters()):
        if param.requires_grad:
            param.register_hook(allreduce_hook)

    def set_needs_reduction(self, input, output):
        self.needs_reduction = True

    module.register_forward_hook(set_needs_reduction)
    return module

View File

@@ -1,381 +0,0 @@
import torch
from torch import nn
from torch.autograd import Variable
from torch.nn.parameter import Parameter
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from loss_scaler import DynamicLossScaler, LossScaler
FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor)
HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor)
def conversion_helper(val, conversion):
    """Map `conversion` over `val`, descending into nested tuples and lists.

    A value that is neither a tuple nor a list is treated as a leaf and handed
    straight to `conversion`. Containers are rebuilt from their converted
    elements: tuples come back as plain tuples, lists as plain lists.
    """
    if isinstance(val, tuple):
        return tuple([conversion_helper(item, conversion) for item in val])
    if isinstance(val, list):
        return [conversion_helper(item, conversion) for item in val]
    return conversion(val)
def fp32_to_fp16(val):
    """Cast every fp32 leaf of `val` to fp16; other leaves pass through unchanged.

    `val` may be a single value or a nested tuple/list structure, which is
    walked by `conversion_helper`.
    """
    def _to_half(leaf):
        # Parameters/Variables are type-checked through their `.data` payload.
        payload = leaf.data if isinstance(leaf, (Parameter, Variable)) else leaf
        return leaf.half() if isinstance(payload, FLOAT_TYPES) else leaf

    return conversion_helper(val, _to_half)
def fp16_to_fp32(val):
    """Cast every fp16 leaf of `val` to fp32; other leaves pass through unchanged.

    `val` may be a single value or a nested tuple/list structure, which is
    walked by `conversion_helper`.
    """
    def _to_float(leaf):
        # Parameters/Variables are type-checked through their `.data` payload.
        payload = leaf.data if isinstance(leaf, (Parameter, Variable)) else leaf
        return leaf.float() if isinstance(payload, HALF_TYPES) else leaf

    return conversion_helper(val, _to_float)
class FP16_Module(nn.Module):
    """Run a module in half precision behind an fp32-facing interface.

    The wrapped module is converted with `.half()` at construction time. On
    each call the positional inputs go through `fp32_to_fp16` before the
    wrapped module runs, and its outputs go through `fp16_to_fp32` before
    being returned. Keyword arguments are forwarded untouched.
    """

    def __init__(self, module):
        super(FP16_Module, self).__init__()
        half_module = module.half()
        self.add_module('module', half_module)

    def forward(self, *inputs, **kwargs):
        half_inputs = fp32_to_fp16(inputs)
        half_outputs = self.module(*half_inputs, **kwargs)
        return fp16_to_fp32(half_outputs)
class FP16_Optimizer(object):
"""
FP16_Optimizer is designed to wrap an existing PyTorch optimizer,
and enable an fp16 model to be trained using a master copy of fp32 weights.
Args:
optimizer (torch.optim.optimizer): Existing optimizer containing initialized fp16 parameters. Internally, FP16_Optimizer replaces the passed optimizer's fp16 parameters with new fp32 parameters copied from the original ones. FP16_Optimizer also stores references to the original fp16 parameters, and updates these fp16 parameters from the master fp32 copy after each step.
static_loss_scale (float, optional, default=1.0): Loss scale used internally to scale fp16 gradients computed by the model. Scaled gradients will be copied to fp32, then downscaled before being applied to the fp32 master params, so static_loss_scale should not affect learning rate.
dynamic_loss_scale (bool, optional, default=False): Use dynamic loss scaling. If True, this will override any static_loss_scale option.
"""
def __init__(self, optimizer, static_loss_scale=1.0, dynamic_loss_scale=False):
    """Wrap `optimizer`, giving its fp16 parameters an fp32 master copy.

    For every param group, the trainable parameters are split by type:
    torch.cuda.HalfTensor parameters are cloned into a single flattened fp32
    tensor (with a pre-allocated `.grad`), torch.cuda.FloatTensor parameters
    are kept as they are. The group's 'params' entry is replaced by
    `[flattened fp32 master] + fp32 params`, and a new optimizer of the same
    class is built from the rewritten groups.

    Args:
        optimizer: existing optimizer holding the model's parameters.
        static_loss_scale: loss scale handed to `LossScaler` when dynamic
            scaling is off.
        dynamic_loss_scale: when True, a `DynamicLossScaler` is used and
            `static_loss_scale` is ignored.

    Raises:
        SystemError: if CUDA is not available.
        TypeError: if a trainable parameter is neither a cuda half nor a cuda
            float tensor.
    """
    # is_available is a function: without the call the guard could never fire.
    if not torch.cuda.is_available():
        raise SystemError('Cannot use fp16 without CUDA')

    self.fp16_param_groups = []
    self.fp32_param_groups = []
    self.fp32_flattened_groups = []
    for i, param_group in enumerate(optimizer.param_groups):
        print("FP16_Optimizer processing param group {}:".format(i))
        fp16_params_this_group = []
        fp32_params_this_group = []
        for param in param_group['params']:
            if param.requires_grad:
                if param.type() == 'torch.cuda.HalfTensor':
                    print("FP16_Optimizer received torch.cuda.HalfTensor with {}"
                          .format(param.size()))
                    fp16_params_this_group.append(param)
                elif param.type() == 'torch.cuda.FloatTensor':
                    print("FP16_Optimizer received torch.cuda.FloatTensor with {}"
                          .format(param.size()))
                    fp32_params_this_group.append(param)
                else:
                    raise TypeError("Wrapped parameters must be either "
                                    "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
                                    "Received {}".format(param.type()))

        fp32_flattened_this_group = None
        if len(fp16_params_this_group) > 0:
            # One flat fp32 master tensor holds copies of all fp16 params in the group.
            fp32_flattened_this_group = _flatten_dense_tensors(
                [param.detach().data.clone().float() for param in fp16_params_this_group])

            fp32_flattened_this_group = Variable(fp32_flattened_this_group, requires_grad=True)

            # Pre-allocate the gradient buffer that fp16 grads are later copied into.
            fp32_flattened_this_group.grad = fp32_flattened_this_group.new(
                *fp32_flattened_this_group.size())

        # The optimizer only ever sees fp32 tensors: the flat master (if any)
        # followed by the parameters that were fp32 to begin with.
        if fp32_flattened_this_group is not None:
            param_group['params'] = [fp32_flattened_this_group] + fp32_params_this_group
        else:
            param_group['params'] = fp32_params_this_group

        self.fp16_param_groups.append(fp16_params_this_group)
        self.fp32_param_groups.append(fp32_params_this_group)
        self.fp32_flattened_groups.append(fp32_flattened_this_group)

    # Rebuild an optimizer of the same class over the rewritten param groups.
    self.optimizer = optimizer.__class__(optimizer.param_groups)

    self.param_groups = self.optimizer.param_groups

    if dynamic_loss_scale:
        self.dynamic_loss_scale = True
        self.loss_scaler = DynamicLossScaler()
    else:
        self.dynamic_loss_scale = False
        self.loss_scaler = LossScaler(static_loss_scale)

    self.overflow = False
    self.first_closure_call_this_step = True
def zero_grad(self):
    """Reset gradients on the wrapped optimizer's params and on the fp16 params.

    The wrapped optimizer's param groups hold only fp32 tensors, so the
    original fp16 parameters are cleared here explicitly.
    """
    self.optimizer.zero_grad()
    for param in (p for group in self.fp16_param_groups for p in group):
        if param.grad is None:
            continue
        # The original authors note that torch.optim's zero_grad() also
        # detaches before zeroing; the same order is kept here.
        param.grad.detach_()
        param.grad.zero_()
def _check_overflow(self):
    """Ask the loss scaler whether any tracked parameter overflowed.

    Collects every fp16 parameter followed by every non-flattened fp32
    parameter, and stores the scaler's answer in `self.overflow`.
    """
    all_groups = list(self.fp16_param_groups) + list(self.fp32_param_groups)
    params = [param for group in all_groups for param in group]
    self.overflow = self.loss_scaler.has_overflow(params)
def _update_scale(self, has_overflow=False):
    """Pass the overflow flag on to `self.loss_scaler.update_scale`."""
    scaler = self.loss_scaler
    scaler.update_scale(has_overflow)
def _copy_grads_fp16_to_fp32(self):
    """Copy each group's fp16 gradients into its flattened fp32 master gradient.

    Groups without fp16 parameters are skipped.
    """
    pairs = zip(self.fp32_flattened_groups, self.fp16_param_groups)
    for master, half_params in pairs:
        if not len(half_params) > 0:
            continue
        half_grads = [half_param.grad.data for half_param in half_params]
        # Flattening builds a temporary, possibly one copy more than needed.
        master.grad.data.copy_(_flatten_dense_tensors(half_grads))
def _downscale_fp32(self):
    """Multiply the wrapped optimizer's gradients by 1/loss_scale, in place.

    Does nothing when `self.loss_scale` equals 1.0.
    """
    if self.loss_scale == 1.0:
        return
    for group in self.optimizer.param_groups:
        for param in group['params']:
            param.grad.data.mul_(1./self.loss_scale)
def clip_fp32_grads(self, clip=-1):
    """Clip the gradient norm of the wrapped optimizer's parameters in place.

    Args:
        clip: maximum total gradient norm; values <= 0 disable clipping.

    Returns:
        The value returned by `torch.nn.utils.clip_grad_norm_` (the total
        norm of the gradients), or None when `self.overflow` is set or
        clipping is disabled.
    """
    if self.overflow or not clip > 0:
        return None
    fp32_params = [param
                   for param_group in self.optimizer.param_groups
                   for param in param_group['params']]
    # clip_grad_norm_ replaces the deprecated clip_grad_norm spelling.
    return torch.nn.utils.clip_grad_norm_(fp32_params, clip)
def _copy_params_fp32_to_fp16(self):
    """Write the flattened fp32 master values back into the fp16 parameters.

    Groups without fp16 parameters are skipped.
    """
    for half_params, master in zip(self.fp16_param_groups, self.fp32_flattened_groups):
        if not len(half_params) > 0:
            continue
        # Split the flat master buffer into pieces shaped like each fp16 param.
        pieces = _unflatten_dense_tensors(master.data, half_params)
        for half_param, piece in zip(half_params, pieces):
            half_param.data.copy_(piece)
def state_dict(self):
    """Snapshot this wrapper's state together with the wrapped optimizer's.

    Returns:
        dict with the keys 'loss_scaler', 'dynamic_loss_scale', 'overflow',
        'first_closure_call_this_step' and 'optimizer_state_dict' (the
        wrapped optimizer's own `state_dict()`).

    Marked as untested by the original authors.
    """
    return {
        'loss_scaler': self.loss_scaler,
        'dynamic_loss_scale': self.dynamic_loss_scale,
        'overflow': self.overflow,
        'first_closure_call_this_step': self.first_closure_call_this_step,
        'optimizer_state_dict': self.optimizer.state_dict(),
    }
def load_state_dict(self, state_dict):
    """Restore state produced by an earlier call to `state_dict`.

    The wrapper's own attributes are restored first, then the nested
    'optimizer_state_dict' entry is handed to the wrapped optimizer.

    Marked as untested by the original authors.
    """
    own_attrs = ('loss_scaler', 'dynamic_loss_scale', 'overflow',
                 'first_closure_call_this_step')
    for attr in own_attrs:
        setattr(self, attr, state_dict[attr])
    self.optimizer.load_state_dict(state_dict['optimizer_state_dict'])
def step(self, closure=None):  # could add clip option.
    """Update the fp32 master params, then refresh the fp16 params from them.

    Without a closure, call this after fp16_optimizer_obj.backward(loss). The
    wrapped optimizer steps on the fp32 master copy and the result is copied
    into the fp16 params originally handed to the constructor, so another
    forward pass can run immediately.

    With a closure, no prior call to self.backward(loss) is needed, but every
    loss.backward() inside the closure must have been replaced by
    fp16_optimizer_obj.backward(loss).

    If `self.overflow` is set, a message is printed and the step is skipped.

    Args:
        closure (optional): Closure passed on to the wrapped optimizer. It
            should call zero_grad on the FP16_Optimizer object, compute the
            loss, call .backward(loss), and return the loss.

    Closure example::

        # optimizer is assumed to be an FP16_Optimizer object, previously
        # constructed from an existing pytorch optimizer.

        for input, target in dataset:
            def closure():
                optimizer.zero_grad()
                output = model(input)
                loss = loss_fn(output, target)
                optimizer.backward(loss)
                return loss
            optimizer.step(closure)

    .. note::
        Compared to `ordinary optimizer closures`_, the only differences are
        that "optimizer" must be an FP16_Optimizer instance and that
        loss.backward is replaced by optimizer.backward(loss).

    .. warning::
        Calling step with a closure is currently not compatible with dynamic
        loss scaling; a TypeError is raised in that case.

    .. _`ordinary optimizer closures`:
        http://pytorch.org/docs/master/optim.html#optimizer-step-closure
    """
    using_closure = closure is not None
    if using_closure and isinstance(self.loss_scaler, DynamicLossScaler):
        raise TypeError("Using step with a closure is currently not "
                        "compatible with dynamic loss scaling.")

    # Read the scale before the scaler is updated so the message reports the
    # value that was actually attempted.
    attempted_scale = self.loss_scaler.loss_scale
    self._update_scale(self.overflow)

    if self.overflow:
        print("OVERFLOW! Skipping step. Attempted loss scale: {}".format(attempted_scale))
        return

    if using_closure:
        self._step_with_closure(closure)
    else:
        self.optimizer.step()

    self._copy_params_fp32_to_fp16()
def _step_with_closure(self, closure):
    """Step the wrapped optimizer with `closure`, keeping fp16 params fresh.

    The closure handed to the wrapped optimizer is a thin wrapper: from its
    second invocation within one step onwards, it first copies the fp32
    master params into the fp16 params, then evaluates `closure` and returns
    its loss. `first_closure_call_this_step` is reset to True afterwards.
    """
    def wrapped_closure():
        if not self.first_closure_call_this_step:
            # If self.optimizer.step() calls the closure more than once it may
            # update the fp32 params between calls. The wrapped optimizer
            # knows nothing about the fp16 params, so they are refreshed
            # manually before the model is evaluated again.
            self._copy_params_fp32_to_fp16()
        else:
            # The fp16 params are expected to be fresh on entering step(), so
            # no copy is needed on the first call.
            self.first_closure_call_this_step = False
        # The user is expected to call optimizer.backward(loss) inside
        # `closure` (instead of loss.backward()), which gives the fp32 master
        # params fresh gradients; all that is left is to run it and hand
        # back the loss.
        return closure()

    self.optimizer.step(wrapped_closure)
    self.first_closure_call_this_step = True
def backward(self, loss, update_fp32_grads=True):
    """Backpropagate ``loss`` through the loss scaler and refresh fp32 grads.

    Conceptually, fp16_optimizer_obj.backward does the following:

    1. ``fp32_loss = loss.float()`` (see the first Note).
    2. ``scaled_loss = fp32_loss * loss_scale``.
    3. ``scaled_loss.backward()``, accumulating scaled gradients into the
       ``.grad`` attributes of the fp16 model's leaves.
    4. fp16 grads are copied to the stored fp32 params' ``.grad``
       attributes (see the second Note).
    5. fp32 grads are divided by ``loss_scale``.

    Afterwards the fp32 parameters hold fresh gradients and
    fp16_optimizer_obj.step may be called.

    .. note::
        Casting the loss to fp32 before the loss scale is applied gives
        some extra protection against overflow when an fp16 value is
        supplied.  For maximum overflow safety, compute the loss criterion
        (MSE, cross entropy, etc) in fp32 before passing it in.

    .. note::
        Gradients left on the fp16 model's leaves after this call should
        not be regarded as valid in general: they may have been scaled,
        and with dynamic loss scaling the scale factor may silently change
        over time.  To inspect gradients, query the ``.grad`` attribute of
        FP16_Optimizer's stored fp32 parameters instead.

    Args:
        loss: The loss output by the user's model; either float or half
            (but see the first Note).
        update_fp32_grads (bool, optional, default=True): Copy fp16 grads
            to fp32 grads on this call.  Pass False to delay the copy,
            which avoids redundant fp16->fp32 grad copies when backward is
            called on several losses in one iteration.  The caller is then
            responsible for calling fp16_optimizer_obj.update_fp32_grads
            before fp16_optimizer_obj.step.

    Example::

        # Ordinary operation:
        optimizer.backward(loss)

        # Naive operation with multiple losses (technically valid, but
        # less efficient): fp32 grads will be correct after the second
        # call, but the first call incurs an unnecessary fp16->fp32 copy.
        optimizer.backward(loss1)
        optimizer.backward(loss2)

        # More efficient way to handle multiple losses: the fp16->fp32
        # grad copy is delayed until all losses have been accumulated.
        optimizer.backward(loss1, update_fp32_grads=False)
        optimizer.backward(loss2, update_fp32_grads=False)
        optimizer.update_fp32_grads()
    """
    fp32_loss = loss.float()
    self.loss_scaler.backward(fp32_loss)
    if not update_fp32_grads:
        # Caller opted to trigger the fp16->fp32 grad copy later.
        return
    self.update_fp32_grads()
def update_fp32_grads(self):
    """Propagate fp16 gradients to the fp32 master parameters.

    Copies the ``.grad`` attribute from the stored references to fp16
    parameters into the ``.grad`` attribute of the master fp32 parameters
    that the optimizer updates directly, then downscales them.
    :attr:`update_fp32_grads` only needs to be called explicitly if
    fp16_optimizer_obj.backward was called with update_fp32_grads=False.

    With dynamic loss scaling, an overflow check runs first and the copy
    is skipped when an overflow is flagged.
    """
    overflowed = False
    if self.dynamic_loss_scale:
        self._check_overflow()
        overflowed = self.overflow
    if overflowed:
        return
    self._copy_grads_fp16_to_fp32()
    self._downscale_fp32()
@property
def loss_scale(self):
    """Current loss scale, read from the wrapped loss scaler."""
    scaler = self.loss_scaler
    return scaler.loss_scale

View File

@@ -10,15 +10,16 @@ def create_hparams(hparams_string=None, verbose=False):
# Experiment Parameters #
################################
epochs=500,
iters_per_checkpoint=500,
iters_per_checkpoint=1000,
seed=1234,
dynamic_loss_scaling=True,
fp16_run=False,
distributed_run=False,
dist_backend="nccl",
dist_url="file://distributed.dpt",
dist_url="tcp://localhost:54321",
cudnn_enabled=True,
cudnn_benchmark=False,
ignore_layers=['embedding.weight'],
################################
# Data Parameters #
@@ -27,7 +28,6 @@ def create_hparams(hparams_string=None, verbose=False):
training_files='filelists/ljs_audio_text_train_filelist.txt',
validation_files='filelists/ljs_audio_text_val_filelist.txt',
text_cleaners=['english_cleaners'],
sort_by_length=False,
################################
# Audio Parameters #
@@ -37,14 +37,14 @@ def create_hparams(hparams_string=None, verbose=False):
filter_length=1024,
hop_length=256,
win_length=1024,
n_mel_channels=80,
n_mel_channels=40,
mel_fmin=0.0,
mel_fmax=None, # if None, half the sampling rate
mel_fmax=4000.0,
################################
# Model Parameters #
################################
n_symbols=len(symbols),
n_symbols=1000,#len(symbols),
symbols_embedding_dim=512,
# Encoder parameters
@@ -53,11 +53,13 @@ def create_hparams(hparams_string=None, verbose=False):
encoder_embedding_dim=512,
# Decoder parameters
n_frames_per_step=1,
n_frames_per_step=1, # currently only 1 is supported
decoder_rnn_dim=1024,
prenet_dim=256,
max_decoder_steps=1000,
gate_threshold=0.6,
gate_threshold=0.5,
p_attention_dropout=0.1,
p_decoder_dropout=0.1,
# Attention parameters
attention_rnn_dim=1024,
@@ -75,11 +77,12 @@ def create_hparams(hparams_string=None, verbose=False):
################################
# Optimization Hyperparameters #
################################
use_saved_learning_rate=False,
learning_rate=1e-3,
weight_decay=1e-6,
grad_clip_thresh=1,
batch_size=48,
mask_padding=False # set model's padded outputs to padded values
grad_clip_thresh=1.0,
batch_size=64,
mask_padding=True # set model's padded outputs to padded values
)
if hparams_string:

File diff suppressed because one or more lines are too long

View File

@@ -10,7 +10,7 @@ class LinearNorm(torch.nn.Module):
super(LinearNorm, self).__init__()
self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
torch.nn.init.xavier_uniform(
torch.nn.init.xavier_uniform_(
self.linear_layer.weight,
gain=torch.nn.init.calculate_gain(w_init_gain))
@@ -31,7 +31,7 @@ class ConvNorm(torch.nn.Module):
padding=padding, dilation=dilation,
bias=bias)
torch.nn.init.xavier_uniform(
torch.nn.init.xavier_uniform_(
self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
def forward(self, signal):
@@ -42,7 +42,7 @@ class ConvNorm(torch.nn.Module):
class TacotronSTFT(torch.nn.Module):
def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
mel_fmax=None):
mel_fmax=8000.0):
super(TacotronSTFT, self).__init__()
self.n_mel_channels = n_mel_channels
self.sampling_rate = sampling_rate

View File

@@ -1,5 +1,5 @@
import random
import torch.nn.functional as F
import torch
from tensorboardX import SummaryWriter
from plotting_utils import plot_alignment_to_numpy, plot_spectrogram_to_numpy
from plotting_utils import plot_gate_outputs_to_numpy
@@ -44,5 +44,5 @@ class Tacotron2Logger(SummaryWriter):
"gate",
plot_gate_outputs_to_numpy(
gate_targets[idx].data.cpu().numpy(),
F.sigmoid(gate_outputs[idx]).data.cpu().numpy()),
torch.sigmoid(gate_outputs[idx]).data.cpu().numpy()),
iteration)

View File

@@ -1,10 +1,10 @@
from math import sqrt
import torch
from torch.autograd import Variable
from torch import nn
from torch.nn import functional as F
from layers import ConvNorm, LinearNorm
from utils import to_gpu, get_mask_from_lengths
from fp16_optimizer import fp32_to_fp16, fp16_to_fp32
class LocationLayer(nn.Module):
@@ -56,7 +56,7 @@ class Attention(nn.Module):
processed_query = self.query_layer(query.unsqueeze(1))
processed_attention_weights = self.location_layer(attention_weights_cat)
energies = self.v(F.tanh(
energies = self.v(torch.tanh(
processed_query + processed_attention_weights + processed_memory))
energies = energies.squeeze(-1)
@@ -107,7 +107,6 @@ class Postnet(nn.Module):
def __init__(self, hparams):
super(Postnet, self).__init__()
self.dropout = nn.Dropout(0.5)
self.convolutions = nn.ModuleList()
self.convolutions.append(
@@ -141,9 +140,8 @@ class Postnet(nn.Module):
def forward(self, x):
for i in range(len(self.convolutions) - 1):
x = self.dropout(F.tanh(self.convolutions[i](x)))
x = self.dropout(self.convolutions[-1](x))
x = F.dropout(torch.tanh(self.convolutions[i](x)), 0.5, self.training)
x = F.dropout(self.convolutions[-1](x), 0.5, self.training)
return x
@@ -155,7 +153,6 @@ class Encoder(nn.Module):
"""
def __init__(self, hparams):
super(Encoder, self).__init__()
self.dropout = nn.Dropout(0.5)
convolutions = []
for _ in range(hparams.encoder_n_convolutions):
@@ -175,7 +172,7 @@ class Encoder(nn.Module):
def forward(self, x, input_lengths):
for conv in self.convolutions:
x = self.dropout(F.relu(conv(x)))
x = F.dropout(F.relu(conv(x)), 0.5, self.training)
x = x.transpose(1, 2)
@@ -194,7 +191,7 @@ class Encoder(nn.Module):
def inference(self, x):
for conv in self.convolutions:
x = self.dropout(F.relu(conv(x)))
x = F.dropout(F.relu(conv(x)), 0.5, self.training)
x = x.transpose(1, 2)
@@ -215,6 +212,8 @@ class Decoder(nn.Module):
self.prenet_dim = hparams.prenet_dim
self.max_decoder_steps = hparams.max_decoder_steps
self.gate_threshold = hparams.gate_threshold
self.p_attention_dropout = hparams.p_attention_dropout
self.p_decoder_dropout = hparams.p_decoder_dropout
self.prenet = Prenet(
hparams.n_mel_channels * hparams.n_frames_per_step,
@@ -235,7 +234,7 @@ class Decoder(nn.Module):
self.linear_projection = LinearNorm(
hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
hparams.n_mel_channels*hparams.n_frames_per_step)
hparams.n_mel_channels * hparams.n_frames_per_step)
self.gate_layer = LinearNorm(
hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, 1,
@@ -350,11 +349,11 @@ class Decoder(nn.Module):
gate_output: gate output energies
attention_weights:
"""
decoder_input = self.prenet(decoder_input)
cell_input = torch.cat((decoder_input, self.attention_context), -1)
self.attention_hidden, self.attention_cell = self.attention_rnn(
cell_input, (self.attention_hidden, self.attention_cell))
self.attention_hidden = F.dropout(
self.attention_hidden, self.p_attention_dropout, self.training)
attention_weights_cat = torch.cat(
(self.attention_weights.unsqueeze(1),
@@ -368,6 +367,8 @@ class Decoder(nn.Module):
(self.attention_hidden, self.attention_context), -1)
self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
decoder_input, (self.decoder_hidden, self.decoder_cell))
self.decoder_hidden = F.dropout(
self.decoder_hidden, self.p_decoder_dropout, self.training)
decoder_hidden_attention_context = torch.cat(
(self.decoder_hidden, self.attention_context), dim=1)
@@ -392,22 +393,23 @@ class Decoder(nn.Module):
alignments: sequence of attention weights from the decoder
"""
decoder_input = self.get_go_frame(memory)
decoder_input = self.get_go_frame(memory).unsqueeze(0)
decoder_inputs = self.parse_decoder_inputs(decoder_inputs)
decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0)
decoder_inputs = self.prenet(decoder_inputs)
self.initialize_decoder_states(
memory, mask=~get_mask_from_lengths(memory_lengths))
mel_outputs, gate_outputs, alignments = [], [], []
while len(mel_outputs) < decoder_inputs.size(0):
while len(mel_outputs) < decoder_inputs.size(0) - 1:
decoder_input = decoder_inputs[len(mel_outputs)]
mel_output, gate_output, attention_weights = self.decode(
decoder_input)
mel_outputs += [mel_output]
gate_outputs += [gate_output.squeeze(1)]
mel_outputs += [mel_output.squeeze(1)]
gate_outputs += [gate_output.squeeze()]
alignments += [attention_weights]
decoder_input = decoder_inputs[len(mel_outputs) - 1]
mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
mel_outputs, gate_outputs, alignments)
@@ -431,13 +433,14 @@ class Decoder(nn.Module):
mel_outputs, gate_outputs, alignments = [], [], []
while True:
decoder_input = self.prenet(decoder_input)
mel_output, gate_output, alignment = self.decode(decoder_input)
mel_outputs += [mel_output]
gate_outputs += [gate_output.squeeze(1)]
mel_outputs += [mel_output.squeeze(1)]
gate_outputs += [gate_output]
alignments += [alignment]
if F.sigmoid(gate_output.data) > self.gate_threshold:
if torch.sigmoid(gate_output.data) > self.gate_threshold:
break
elif len(mel_outputs) == self.max_decoder_steps:
print("Warning! Reached max decoder steps")
@@ -460,6 +463,9 @@ class Tacotron2(nn.Module):
self.n_frames_per_step = hparams.n_frames_per_step
self.embedding = nn.Embedding(
hparams.n_symbols, hparams.symbols_embedding_dim)
std = sqrt(2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim))
val = sqrt(3.0) * std # uniform bounds for std
self.embedding.weight.data.uniform_(-val, val)
self.encoder = Encoder(hparams)
self.decoder = Decoder(hparams)
self.postnet = Postnet(hparams)
@@ -468,8 +474,8 @@ class Tacotron2(nn.Module):
text_padded, input_lengths, mel_padded, gate_padded, \
output_lengths = batch
text_padded = to_gpu(text_padded).long()
max_len = int(torch.max(input_lengths.data).numpy())
input_lengths = to_gpu(input_lengths).long()
max_len = torch.max(input_lengths.data).item()
mel_padded = to_gpu(mel_padded).float()
gate_padded = to_gpu(gate_padded).float()
output_lengths = to_gpu(output_lengths).long()
@@ -478,13 +484,9 @@ class Tacotron2(nn.Module):
(text_padded, input_lengths, mel_padded, max_len, output_lengths),
(mel_padded, gate_padded))
def parse_input(self, inputs):
inputs = fp32_to_fp16(inputs) if self.fp16_run else inputs
return inputs
def parse_output(self, outputs, output_lengths=None):
if self.mask_padding and output_lengths is not None:
mask = ~get_mask_from_lengths(output_lengths+1) # +1 <stop> token
mask = ~get_mask_from_lengths(output_lengths)
mask = mask.expand(self.n_mel_channels, mask.size(0), mask.size(1))
mask = mask.permute(1, 0, 2)
@@ -492,39 +494,27 @@ class Tacotron2(nn.Module):
outputs[1].data.masked_fill_(mask, 0.0)
outputs[2].data.masked_fill_(mask[:, 0, :], 1e3) # gate energies
outputs = fp16_to_fp32(outputs) if self.fp16_run else outputs
return outputs
def forward(self, inputs):
inputs, input_lengths, targets, max_len, \
output_lengths = self.parse_input(inputs)
input_lengths, output_lengths = input_lengths.data, output_lengths.data
text_inputs, text_lengths, mels, max_len, output_lengths = inputs
text_lengths, output_lengths = text_lengths.data, output_lengths.data
embedded_inputs = self.embedding(inputs).transpose(1, 2)
embedded_inputs = self.embedding(text_inputs).transpose(1, 2)
encoder_outputs = self.encoder(embedded_inputs, input_lengths)
encoder_outputs = self.encoder(embedded_inputs, text_lengths)
mel_outputs, gate_outputs, alignments = self.decoder(
encoder_outputs, targets, memory_lengths=input_lengths)
encoder_outputs, mels, memory_lengths=text_lengths)
mel_outputs_postnet = self.postnet(mel_outputs)
mel_outputs_postnet = mel_outputs + mel_outputs_postnet
# DataParallel expects equal sized inputs/outputs, hence padding
if input_lengths is not None:
alignments = alignments.unsqueeze(0)
alignments = nn.functional.pad(
alignments,
(0, max_len - alignments.size(3), 0, 0),
"constant", 0)
alignments = alignments.squeeze()
return self.parse_output(
[mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
output_lengths)
def inference(self, inputs):
inputs = self.parse_input(inputs)
embedded_inputs = self.embedding(inputs).transpose(1, 2)
encoder_outputs = self.encoder.inference(embedded_inputs)
mel_outputs, gate_outputs, alignments = self.decoder.inference(

View File

@@ -1,9 +1,9 @@
torch==0.2.0.post3
matplotlib==2.1.0
tensorflow
numpy==1.13.3
tensorflow==1.1.0
numpy==1.17.1
inflect==0.2.5
librosa==0.6.0
scipy==1.0.0
tensorboardX==1.1
Unidecode==1.0.22
pillow

1000
spm.vocab Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -61,7 +61,7 @@ class STFT(torch.nn.Module):
np.linalg.pinv(scale * fourier_basis).T[:, None, :])
if window is not None:
assert(win_length >= filter_length)
assert(filter_length >= win_length)
# get window and zero center pad it to filter_length
fft_window = get_window(window, win_length, fftbins=True)
fft_window = pad_center(fft_window, filter_length)
@@ -124,6 +124,7 @@ class STFT(torch.nn.Module):
np.where(window_sum > tiny(window_sum))[0])
window_sum = torch.autograd.Variable(
torch.from_numpy(window_sum), requires_grad=False)
window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum
inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]
# scale by hop ratio

View File

@@ -37,8 +37,6 @@ def text_to_sequence(text, cleaner_names):
sequence += _arpabet_to_sequence(m.group(2))
text = m.group(3)
# Append EOS token
sequence.append(_symbol_to_id['~'])
return sequence

View File

@@ -7,11 +7,12 @@ The default is a set of ASCII characters that works well for English or text tha
from text import cmudict
_pad = '_'
_eos = '~'
_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '
_punctuation = '!\'(),.:;? '
_special = '-'
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
_arpabet = ['@' + s for s in cmudict.valid_symbols]
# Export all symbols:
symbols = [_pad, _eos] + list(_characters) + _arpabet
symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet

77
text_codec.py Normal file
View File

@@ -0,0 +1,77 @@
from utils import load_filepaths_and_text
from text import text_to_sequence, sequence_to_text
from hparams import create_hparams
import sentencepiece as spm
from text import symbols
from bpemb import BPEmb
# Plain-text corpus (one transcript per line) written by
# _create_sentencepiece_corpus and used as the SentencePiece training input.
SPM_CORPUS_FILE = "filelists/text_corpus.txt"
# Prefix of the SentencePiece model; "<prefix>.model" is what
# _spm_text_codecs loads.
SPM_MODEL_PREFIX = "spm"
# Default vocabulary size handed to the SentencePiece trainer.
SPM_VOCAB_SIZE = 1000
# Hyperparameters are built once at import time; this module reads the
# training/validation filelist paths and the text cleaners from them.
hparams = create_hparams()
def _create_sentencepiece_corpus():
    """Write all training and validation transcripts to SPM_CORPUS_FILE.

    The transcript is taken from the second field of every filelist entry
    and written one per line.  The corpus is written as UTF-8 explicitly,
    matching how the filelists are read, so the result does not depend on
    the platform's default locale encoding.
    """

    def get_text_list(text_file):
        # Entry layout is [path, text, ...]; keep only the text.
        return [i[1] + "\n" for i in load_filepaths_and_text(text_file)]

    full_text_list = get_text_list(hparams.training_files) + get_text_list(
        hparams.validation_files
    )
    with open(SPM_CORPUS_FILE, "w", encoding="utf-8") as fd:
        fd.writelines(full_text_list)
def _create_sentencepiece_vocab(vocab_size=SPM_VOCAB_SIZE):
    """Train a unigram SentencePiece model on SPM_CORPUS_FILE.

    Args:
        vocab_size: Vocabulary size requested from the trainer.

    The trainer is given SPM_MODEL_PREFIX as the model prefix.
    """
    template = "--input={} --model_type=unigram --character_coverage=1.0 --model_prefix={} --vocab_size={}"
    trainer_args = template.format(SPM_CORPUS_FILE, SPM_MODEL_PREFIX, vocab_size)
    spm.SentencePieceTrainer.Train(trainer_args)
def _spm_text_codecs():
    """Build an (encode, decode) pair backed by the SentencePiece model.

    Loads "<SPM_MODEL_PREFIX>.model" and returns two callables:
    ``encode(text, cleaners)`` producing ids and ``decode(sequence)``
    producing text.
    """
    processor = spm.SentencePieceProcessor()
    model_path = "{}.model".format(SPM_MODEL_PREFIX)
    processor.Load(model_path)

    def encode(text, cleaners):
        # `cleaners` is accepted but not used by this codec.
        return processor.EncodeAsIds(text)

    def decode(sequence):
        return processor.DecodeIds(sequence)

    return encode, decode
def _bpemb_text_codecs():
    """Build an (encode, decode) pair backed by a pretrained English BPEmb.

    Returns two callables: ``encode(text, cleaners)`` producing ids and
    ``decode(sequence)`` producing text.
    """
    # NOTE(review): vs=148 may not be a published BPEmb vocabulary size;
    # confirm which model the library actually resolves this to.
    codec = BPEmb(lang="en", dim=50, vs=148)

    def encode(text, cleaners):
        # `cleaners` is accepted but not used by this codec.
        return codec.encode_ids(text)

    def decode(sequence):
        return codec.decode_ids(sequence)

    return encode, decode
# Select the active codec pair.  This assignment rebinds the
# `text_to_sequence` / `sequence_to_text` names imported from `text` at the
# top of the module; swap the commented line in to use the SentencePiece
# codec instead of BPEmb.
# text_to_sequence, sequence_to_text = _spm_text_codecs()
text_to_sequence, sequence_to_text = _bpemb_text_codecs()
def _interactive_test():
    """Round-trip prompts through the active codec until the user quits.

    Encodes each prompt with ``text_to_sequence``, decodes the ids back
    with ``sequence_to_text`` and prints the result.  The loop starts with
    a built-in sample sentence and ends when the user enters "q" or
    "quit", or when stdin reaches end-of-file.
    """
    prompt = "Hello world; how are you, doing ?"
    while prompt not in ("q", "quit"):
        oup = sequence_to_text(text_to_sequence(prompt, hparams.text_cleaners))
        print('==> ',oup)
        try:
            prompt = input("> ")
        except EOFError:
            # stdin was closed (Ctrl-D, or piped input ran out): treat it
            # as a request to quit rather than crashing with a traceback.
            break
def main():
    """Script entry point: run the interactive codec round-trip check.

    The corpus and vocabulary builders are left disabled below; enable
    them to regenerate the SentencePiece corpus and model first.
    """
    # _create_sentencepiece_corpus()
    # _create_sentencepiece_vocab()
    _interactive_test()
if __name__ == "__main__":
main()

136
train.py
View File

@@ -5,13 +5,11 @@ import math
from numpy import finfo
import torch
from distributed import DistributedDataParallel
from distributed import apply_gradient_allreduce
import torch.distributed as dist
from torch.utils.data.distributed import DistributedSampler
from torch.nn import DataParallel
from torch.utils.data import DataLoader
from fp16_optimizer import FP16_Optimizer
from model import Tacotron2
from data_utils import TextMelLoader, TextMelCollate
from loss_function import Tacotron2Loss
@@ -19,30 +17,22 @@ from logger import Tacotron2Logger
from hparams import create_hparams
def batchnorm_to_float(module):
"""Converts batch norm modules to FP32"""
if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
module.float()
for child in module.children():
batchnorm_to_float(child)
return module
def reduce_tensor(tensor, num_gpus):
def reduce_tensor(tensor, n_gpus):
rt = tensor.clone()
torch.distributed.all_reduce(rt, op=torch.distributed.reduce_op.SUM)
rt /= num_gpus
dist.all_reduce(rt, op=dist.reduce_op.SUM)
rt /= n_gpus
return rt
def init_distributed(hparams, n_gpus, rank, group_name):
assert torch.cuda.is_available(), "Distributed mode requires CUDA."
print("Initializing distributed")
print("Initializing Distributed")
# Set cuda device so everything is done on the right GPU.
torch.cuda.set_device(rank % torch.cuda.device_count())
# Initialize distributed communication
torch.distributed.init_process_group(
dist.init_process_group(
backend=hparams.dist_backend, init_method=hparams.dist_url,
world_size=n_gpus, rank=rank, group_name=group_name)
@@ -55,10 +45,14 @@ def prepare_dataloaders(hparams):
valset = TextMelLoader(hparams.validation_files, hparams)
collate_fn = TextMelCollate(hparams.n_frames_per_step)
train_sampler = DistributedSampler(trainset) \
if hparams.distributed_run else None
if hparams.distributed_run:
train_sampler = DistributedSampler(trainset)
shuffle = False
else:
train_sampler = None
shuffle = True
train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
train_loader = DataLoader(trainset, num_workers=1, shuffle=shuffle,
sampler=train_sampler,
batch_size=hparams.batch_size, pin_memory=False,
drop_last=True, collate_fn=collate_fn)
@@ -79,22 +73,26 @@ def prepare_directories_and_logger(output_directory, log_directory, rank):
def load_model(hparams):
model = Tacotron2(hparams).cuda()
if hparams.fp16_run:
model = batchnorm_to_float(model.half())
model.decoder.attention_layer.score_mask_value = float(finfo('float16').min)
model.decoder.attention_layer.score_mask_value = finfo('float16').min
if hparams.distributed_run:
model = DistributedDataParallel(model)
elif torch.cuda.device_count() > 1:
model = DataParallel(model)
model = apply_gradient_allreduce(model)
return model
def warm_start_model(checkpoint_path, model):
def warm_start_model(checkpoint_path, model, ignore_layers):
assert os.path.isfile(checkpoint_path)
print("Warm starting model from checkpoint '{}'".format(checkpoint_path))
checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
model.load_state_dict(checkpoint_dict['state_dict'])
model_dict = checkpoint_dict['state_dict']
if len(ignore_layers) > 0:
model_dict = {k: v for k, v in model_dict.items()
if k not in ignore_layers}
dummy_dict = model.state_dict()
dummy_dict.update(model_dict)
model_dict = dummy_dict
model.load_state_dict(model_dict)
return model
@@ -131,22 +129,21 @@ def validate(model, criterion, valset, iteration, batch_size, n_gpus,
pin_memory=False, collate_fn=collate_fn)
val_loss = 0.0
if distributed_run or torch.cuda.device_count() > 1:
batch_parser = model.module.parse_batch
else:
batch_parser = model.parse_batch
for i, batch in enumerate(val_loader):
x, y = batch_parser(batch)
x, y = model.parse_batch(batch)
y_pred = model(x)
loss = criterion(y_pred, y)
reduced_val_loss = reduce_tensor(loss.data, n_gpus)[0] \
if distributed_run else loss.data[0]
if distributed_run:
reduced_val_loss = reduce_tensor(loss.data, n_gpus).item()
else:
reduced_val_loss = loss.item()
val_loss += reduced_val_loss
val_loss = val_loss / (i + 1)
model.train()
return val_loss
if rank == 0:
print("Validation loss {}: {:9f} ".format(iteration, reduced_val_loss))
logger.log_validation(reduced_val_loss, model, y, y_pred, iteration)
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
@@ -172,9 +169,14 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
learning_rate = hparams.learning_rate
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
weight_decay=hparams.weight_decay)
if hparams.fp16_run:
optimizer = FP16_Optimizer(
optimizer, dynamic_loss_scale=hparams.dynamic_loss_scaling)
from apex import amp
model, optimizer = amp.initialize(
model, optimizer, opt_level='O2')
if hparams.distributed_run:
model = apply_gradient_allreduce(model)
criterion = Tacotron2Loss()
@@ -188,18 +190,18 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
epoch_offset = 0
if checkpoint_path is not None:
if warm_start:
model = warm_start_model(checkpoint_path, model)
model = warm_start_model(
checkpoint_path, model, hparams.ignore_layers)
else:
model, optimizer, learning_rate, iteration = load_checkpoint(
model, optimizer, _learning_rate, iteration = load_checkpoint(
checkpoint_path, model, optimizer)
if hparams.use_saved_learning_rate:
learning_rate = _learning_rate
iteration += 1 # next iteration is iteration + 1
epoch_offset = max(0, int(iteration / len(train_loader)))
model.train()
if hparams.distributed_run or torch.cuda.device_count() > 1:
batch_parser = model.module.parse_batch
else:
batch_parser = model.parse_batch
is_overflow = False
# ================ MAIN TRAINING LOOP! ===================
for epoch in range(epoch_offset, hparams.epochs):
print("Epoch: {}".format(epoch))
@@ -209,42 +211,42 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
param_group['lr'] = learning_rate
model.zero_grad()
x, y = batch_parser(batch)
x, y = model.parse_batch(batch)
y_pred = model(x)
loss = criterion(y_pred, y)
reduced_loss = reduce_tensor(loss.data, n_gpus)[0] \
if hparams.distributed_run else loss.data[0]
loss = criterion(y_pred, y)
if hparams.distributed_run:
reduced_loss = reduce_tensor(loss.data, n_gpus).item()
else:
reduced_loss = loss.item()
if hparams.fp16_run:
optimizer.backward(loss)
grad_norm = optimizer.clip_fp32_grads(hparams.grad_clip_thresh)
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
grad_norm = torch.nn.utils.clip_grad_norm(
if hparams.fp16_run:
grad_norm = torch.nn.utils.clip_grad_norm_(
amp.master_params(optimizer), hparams.grad_clip_thresh)
is_overflow = math.isnan(grad_norm)
else:
grad_norm = torch.nn.utils.clip_grad_norm_(
model.parameters(), hparams.grad_clip_thresh)
optimizer.step()
overflow = optimizer.overflow if hparams.fp16_run else False
if not overflow and not math.isnan(reduced_loss) and rank == 0:
if not is_overflow and rank == 0:
duration = time.perf_counter() - start
print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
iteration, reduced_loss, grad_norm, duration))
logger.log_training(
reduced_loss, grad_norm, learning_rate, duration, iteration)
if not overflow and (iteration % hparams.iters_per_checkpoint == 0):
reduced_val_loss = validate(
model, criterion, valset, iteration, hparams.batch_size,
n_gpus, collate_fn, logger, hparams.distributed_run, rank)
if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0):
validate(model, criterion, valset, iteration,
hparams.batch_size, n_gpus, collate_fn, logger,
hparams.distributed_run, rank)
if rank == 0:
print("Validation loss {}: {:9f} ".format(
iteration, reduced_val_loss))
logger.log_validation(
reduced_val_loss, model, y, y_pred, iteration)
checkpoint_path = os.path.join(
output_directory, "checkpoint_{}".format(iteration))
save_checkpoint(model, optimizer, learning_rate, iteration,
@@ -262,7 +264,7 @@ if __name__ == '__main__':
parser.add_argument('-c', '--checkpoint_path', type=str, default=None,
required=False, help='checkpoint path')
parser.add_argument('--warm_start', action='store_true',
help='load the model only (warm start)')
help='load model weights only, ignore specified layers')
parser.add_argument('--n_gpus', type=int, default=1,
required=False, help='number of gpus')
parser.add_argument('--rank', type=int, default=0,
@@ -279,7 +281,7 @@ if __name__ == '__main__':
torch.backends.cudnn.benchmark = hparams.cudnn_benchmark
print("FP16 Run:", hparams.fp16_run)
print("Dynamic Loss Scaling", hparams.dynamic_loss_scaling)
print("Dynamic Loss Scaling:", hparams.dynamic_loss_scaling)
print("Distributed Run:", hparams.distributed_run)
print("cuDNN Enabled:", hparams.cudnn_enabled)
print("cuDNN Benchmark:", hparams.cudnn_benchmark)

View File

@@ -4,29 +4,26 @@ import torch
def get_mask_from_lengths(lengths):
max_len = torch.max(lengths)
ids = torch.arange(0, max_len).long().cuda()
max_len = torch.max(lengths).item()
ids = torch.arange(0, max_len, out=torch.cuda.LongTensor(max_len))
mask = (ids < lengths.unsqueeze(1)).byte()
return mask
def load_wav_to_torch(full_path, sr):
def load_wav_to_torch(full_path):
sampling_rate, data = read(full_path)
assert sr == sampling_rate, "{} SR doesn't match {} on path {}".format(
sr, sampling_rate, full_path)
return torch.FloatTensor(data.astype(np.float32))
return torch.FloatTensor(data.astype(np.float32)), sampling_rate
def load_filepaths_and_text(filename, sort_by_length, split="|"):
def load_filepaths_and_text(filename, split="|"):
with open(filename, encoding='utf-8') as f:
filepaths_and_text = [line.strip().split(split) for line in f]
if sort_by_length:
filepaths_and_text.sort(key=lambda x: len(x[1]))
return filepaths_and_text
def to_gpu(x):
x = x.contiguous().cuda(async=True)
x = x.contiguous()
if torch.cuda.is_available():
x = x.cuda(non_blocking=True)
return torch.autograd.Variable(x)

1
waveglow Submodule

Submodule waveglow added at 4b1001fa33