mirror of
https://github.com/malarinv/tacotron2
synced 2026-03-08 09:42:34 +00:00
Compare commits
69 Commits
attention_
...
experiment
| Author | SHA1 | Date | |
|---|---|---|---|
| 6d3788d858 | |||
| f449105b79 | |||
| 4d5001bdf0 | |||
|
|
131c1465b4 | ||
|
|
d5321ff0ca | ||
|
|
c76ac3b211 | ||
|
|
e3d2d0a5ef | ||
|
|
a992aea070 | ||
|
|
eb2a171690 | ||
|
|
821bfeba5d | ||
|
|
d6670c8ed7 | ||
|
|
0274619e45 | ||
|
|
bb20035586 | ||
|
|
1480f82908 | ||
|
|
087c86755f | ||
|
|
ece7d3f568 | ||
|
|
f37998c59d | ||
|
|
bff304f432 | ||
|
|
3869781877 | ||
|
|
bb67613493 | ||
|
|
af1f71a975 | ||
|
|
fc0d34cfce | ||
|
|
f2c94d94fd | ||
|
|
df4a466af2 | ||
|
|
825ffa47d1 | ||
|
|
4d7b04120a | ||
|
|
6e430556bd | ||
|
|
3973b3e495 | ||
|
|
52a30bb7b6 | ||
|
|
0ad65cc053 | ||
|
|
8300844fa7 | ||
|
|
f06063f746 | ||
|
|
3045ba125b | ||
|
|
4c4aca3662 | ||
|
|
05dd8f91d2 | ||
|
|
5d66c3deab | ||
|
|
f02704f338 | ||
|
|
ba8cf36198 | ||
|
|
b5e0a93946 | ||
|
|
58b0ec61bd | ||
|
|
1ad939df1a | ||
|
|
32b9a135d0 | ||
|
|
ce29e13959 | ||
|
|
1ea6ed5861 | ||
|
|
cdfde985e5 | ||
|
|
e314bb4cd0 | ||
|
|
4af4ccb135 | ||
|
|
1ec0e5e8cd | ||
|
|
249afd8043 | ||
|
|
1b243d5d5a | ||
|
|
d0aa9e7d32 | ||
|
|
1683a57ae5 | ||
|
|
fc0cf6a89a | ||
|
|
8de38495be | ||
|
|
7eb045206c | ||
|
|
c67005f1be | ||
|
|
cb3794796f | ||
|
|
a8de973923 | ||
|
|
a0ae2da05f | ||
|
|
34066ac4fc | ||
|
|
12ab5ba89c | ||
|
|
d10da5f41e | ||
|
|
5f0ea06c41 | ||
|
|
22bcff1155 | ||
|
|
2e934478a4 | ||
|
|
8ae231b10b | ||
|
|
4d733d1bdd | ||
|
|
b4e52404ab | ||
|
|
064629c9bc |
147
.gitignore
vendored
Normal file
147
.gitignore
vendored
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
.env
|
||||||
|
env/
|
||||||
|
jupyter.json
|
||||||
|
run*/
|
||||||
|
filelists/
|
||||||
|
# Created by https://www.gitignore.io/api/macos
|
||||||
|
# Edit at https://www.gitignore.io/?templates=macos
|
||||||
|
|
||||||
|
### macOS ###
|
||||||
|
# General
|
||||||
|
.DS_Store
|
||||||
|
.AppleDouble
|
||||||
|
.LSOverride
|
||||||
|
|
||||||
|
# Icon must end with two \r
|
||||||
|
Icon
|
||||||
|
|
||||||
|
# Thumbnails
|
||||||
|
._*
|
||||||
|
|
||||||
|
# Files that might appear in the root of a volume
|
||||||
|
.DocumentRevisions-V100
|
||||||
|
.fseventsd
|
||||||
|
.Spotlight-V100
|
||||||
|
.TemporaryItems
|
||||||
|
.Trashes
|
||||||
|
.VolumeIcon.icns
|
||||||
|
.com.apple.timemachine.donotpresent
|
||||||
|
|
||||||
|
# Directories potentially created on remote AFP share
|
||||||
|
.AppleDB
|
||||||
|
.AppleDesktop
|
||||||
|
Network Trash Folder
|
||||||
|
Temporary Items
|
||||||
|
.apdisk
|
||||||
|
|
||||||
|
# End of https://www.gitignore.io/api/macos
|
||||||
|
|
||||||
|
# Created by https://www.gitignore.io/api/python
|
||||||
|
# Edit at https://www.gitignore.io/?templates=python
|
||||||
|
|
||||||
|
### Python ###
|
||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
pip-wheel-metadata/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
# Usually these files are written by a python script from a template
|
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Scrapy stuff:
|
||||||
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
target/
|
||||||
|
|
||||||
|
# pyenv
|
||||||
|
.python-version
|
||||||
|
|
||||||
|
# pipenv
|
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||||
|
# install all needed dependencies.
|
||||||
|
#Pipfile.lock
|
||||||
|
|
||||||
|
# celery beat schedule file
|
||||||
|
celerybeat-schedule
|
||||||
|
|
||||||
|
# SageMath parsed files
|
||||||
|
*.sage.py
|
||||||
|
|
||||||
|
# Spyder project settings
|
||||||
|
.spyderproject
|
||||||
|
.spyproject
|
||||||
|
|
||||||
|
# Rope project settings
|
||||||
|
.ropeproject
|
||||||
|
|
||||||
|
# Mr Developer
|
||||||
|
.mr.developer.cfg
|
||||||
|
.project
|
||||||
|
.pydevproject
|
||||||
|
|
||||||
|
# mkdocs documentation
|
||||||
|
/site
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.dmypy.json
|
||||||
|
dmypy.json
|
||||||
|
|
||||||
|
# Pyre type checker
|
||||||
|
.pyre/
|
||||||
|
|
||||||
|
# End of https://www.gitignore.io/api/python
|
||||||
4
.gitmodules
vendored
Normal file
4
.gitmodules
vendored
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
[submodule "waveglow"]
|
||||||
|
path = waveglow
|
||||||
|
url = https://github.com/NVIDIA/waveglow
|
||||||
|
branch = master
|
||||||
12
Dockerfile
12
Dockerfile
@@ -1,4 +1,10 @@
|
|||||||
FROM pytorch/pytorch:0.4_cuda9_cudnn7
|
FROM pytorch/pytorch:nightly-devel-cuda10.0-cudnn7
|
||||||
|
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH}
|
||||||
|
|
||||||
RUN pip install numpy scipy matplotlib librosa==0.6.0 tensorflow tensorboardX
|
RUN apt-get update -y
|
||||||
inflect==0.2.5 Unidecode==1.0.22
|
|
||||||
|
RUN pip install numpy scipy matplotlib librosa==0.6.0 tensorflow tensorboardX inflect==0.2.5 Unidecode==1.0.22 jupyter
|
||||||
|
|
||||||
|
ADD apex /apex/
|
||||||
|
WORKDIR /apex/
|
||||||
|
RUN pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
|
||||||
|
|||||||
6
Makefile
Normal file
6
Makefile
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
.PHONY: clean clean-test clean-pyc clean-build docs help common.mk
|
||||||
|
.DEFAULT_GOAL := help
|
||||||
|
|
||||||
|
|
||||||
|
notebook:
|
||||||
|
jupyter lab --ip=0.0.0.0 --no-browser --NotebookApp.token='${JUPYTER_TOKEN}'
|
||||||
55
README.md
Normal file → Executable file
55
README.md
Normal file → Executable file
@@ -1,13 +1,15 @@
|
|||||||
# Tacotron 2 (without wavenet)
|
# Tacotron 2 (without wavenet)
|
||||||
|
|
||||||
Tacotron 2 PyTorch implementation of [Natural TTS Synthesis By Conditioning
|
PyTorch implementation of [Natural TTS Synthesis By Conditioning
|
||||||
Wavenet On Mel Spectrogram Predictions](https://arxiv.org/pdf/1712.05884.pdf).
|
Wavenet On Mel Spectrogram Predictions](https://arxiv.org/pdf/1712.05884.pdf).
|
||||||
|
|
||||||
This implementation includes **distributed** and **fp16** support
|
This implementation includes **distributed** and **automatic mixed precision** support
|
||||||
and uses the [LJSpeech dataset](https://keithito.com/LJ-Speech-Dataset/).
|
and uses the [LJSpeech dataset](https://keithito.com/LJ-Speech-Dataset/).
|
||||||
|
|
||||||
Distributed and FP16 support relies on work by Christian Sarofeen and NVIDIA's
|
Distributed and Automatic Mixed Precision support relies on NVIDIA's [Apex] and [AMP].
|
||||||
[Apex Library](https://github.com/nvidia/apex).
|
|
||||||
|
Visit our [website] for audio samples using our published [Tacotron 2] and
|
||||||
|
[WaveGlow] models.
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
@@ -19,28 +21,44 @@ Distributed and FP16 support relies on work by Christian Sarofeen and NVIDIA's
|
|||||||
1. Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/)
|
1. Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/)
|
||||||
2. Clone this repo: `git clone https://github.com/NVIDIA/tacotron2.git`
|
2. Clone this repo: `git clone https://github.com/NVIDIA/tacotron2.git`
|
||||||
3. CD into this repo: `cd tacotron2`
|
3. CD into this repo: `cd tacotron2`
|
||||||
4. Update .wav paths: `sed -i -- 's,DUMMY,ljs_dataset_folder/wavs,g' filelists/*.txt`
|
4. Initialize submodule: `git submodule init; git submodule update`
|
||||||
|
5. Update .wav paths: `sed -i -- 's,DUMMY,ljs_dataset_folder/wavs,g' filelists/*.txt`
|
||||||
- Alternatively, set `load_mel_from_disk=True` in `hparams.py` and update mel-spectrogram paths
|
- Alternatively, set `load_mel_from_disk=True` in `hparams.py` and update mel-spectrogram paths
|
||||||
5. Install [pytorch 0.4](https://github.com/pytorch/pytorch)
|
6. Install [PyTorch 1.0]
|
||||||
6. Install python requirements or build docker image
|
7. Install [Apex]
|
||||||
|
8. Install python requirements or build docker image
|
||||||
- Install python requirements: `pip install -r requirements.txt`
|
- Install python requirements: `pip install -r requirements.txt`
|
||||||
- **OR**
|
|
||||||
- Build docker image: `docker build --tag tacotron2 .`
|
|
||||||
|
|
||||||
## Training
|
## Training
|
||||||
1. `python train.py --output_directory=outdir --log_directory=logdir`
|
1. `python train.py --output_directory=outdir --log_directory=logdir`
|
||||||
2. (OPTIONAL) `tensorboard --logdir=outdir/logdir`
|
2. (OPTIONAL) `tensorboard --logdir=outdir/logdir`
|
||||||
|
|
||||||
## Multi-GPU (distributed) and FP16 Training
|
## Training using a pre-trained model
|
||||||
|
Training using a pre-trained model can lead to faster convergence.
|
||||||
|
By default, the dataset-dependent text embedding layers are [ignored].
|
||||||
|
|
||||||
|
1. Download our published [Tacotron 2] model
|
||||||
|
2. `python train.py --output_directory=outdir --log_directory=logdir -c tacotron2_statedict.pt --warm_start`
|
||||||
|
|
||||||
|
## Multi-GPU (distributed) and Automatic Mixed Precision Training
|
||||||
1. `python -m multiproc train.py --output_directory=outdir --log_directory=logdir --hparams=distributed_run=True,fp16_run=True`
|
1. `python -m multiproc train.py --output_directory=outdir --log_directory=logdir --hparams=distributed_run=True,fp16_run=True`
|
||||||
|
|
||||||
## Inference
|
## Inference demo
|
||||||
1. `jupyter notebook --ip=127.0.0.1 --port=31337`
|
1. Download our published [Tacotron 2] model
|
||||||
2. load inference.ipynb
|
2. Download our published [WaveGlow] model
|
||||||
|
3. `jupyter notebook --ip=127.0.0.1 --port=31337`
|
||||||
|
4. Load inference.ipynb
|
||||||
|
|
||||||
|
N.b. When performing Mel-Spectrogram to Audio synthesis, make sure Tacotron 2
|
||||||
|
and the Mel decoder were trained on the same mel-spectrogram representation.
|
||||||
|
|
||||||
|
|
||||||
## Related repos
|
## Related repos
|
||||||
[nv-wavenet](https://github.com/NVIDIA/nv-wavenet/): Faster than real-time
|
[WaveGlow](https://github.com/NVIDIA/WaveGlow): Faster than real time Flow-based
|
||||||
wavenet inference
|
Generative Network for Speech Synthesis
|
||||||
|
|
||||||
|
[nv-wavenet](https://github.com/NVIDIA/nv-wavenet/): Faster than real time
|
||||||
|
WaveNet.
|
||||||
|
|
||||||
## Acknowledgements
|
## Acknowledgements
|
||||||
This implementation uses code from the following repos: [Keith
|
This implementation uses code from the following repos: [Keith
|
||||||
@@ -54,3 +72,10 @@ We are thankful to the Tacotron 2 paper authors, specially Jonathan Shen, Yuxuan
|
|||||||
Wang and Zongheng Yang.
|
Wang and Zongheng Yang.
|
||||||
|
|
||||||
|
|
||||||
|
[WaveGlow]: https://drive.google.com/file/d/1WsibBTsuRg_SF2Z6L6NFRTT-NjEy1oTx/view?usp=sharing
|
||||||
|
[Tacotron 2]: https://drive.google.com/file/d/1c5ZTuT7J08wLUoVZ2KkUs_VdZuJ86ZqA/view?usp=sharing
|
||||||
|
[pytorch 1.0]: https://github.com/pytorch/pytorch#installation
|
||||||
|
[website]: https://nv-adlr.github.io/WaveGlow
|
||||||
|
[ignored]: https://github.com/NVIDIA/tacotron2/blob/master/hparams.py#L22
|
||||||
|
[Apex]: https://github.com/nvidia/apex
|
||||||
|
[AMP]: https://github.com/NVIDIA/apex/tree/master/apex/amp
|
||||||
|
|||||||
27
common.mk
Normal file
27
common.mk
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
define BROWSER_PYSCRIPT
|
||||||
|
import os, webbrowser, sys
|
||||||
|
|
||||||
|
try:
|
||||||
|
from urllib import pathname2url
|
||||||
|
except:
|
||||||
|
from urllib.request import pathname2url
|
||||||
|
|
||||||
|
webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1])))
|
||||||
|
endef
|
||||||
|
export BROWSER_PYSCRIPT
|
||||||
|
|
||||||
|
define PRINT_HELP_PYSCRIPT
|
||||||
|
import re, sys
|
||||||
|
|
||||||
|
for line in sys.stdin:
|
||||||
|
match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line)
|
||||||
|
if match:
|
||||||
|
target, help = match.groups()
|
||||||
|
print("%-20s %s" % (target, help))
|
||||||
|
endef
|
||||||
|
export PRINT_HELP_PYSCRIPT
|
||||||
|
|
||||||
|
BROWSER := python -c "$$BROWSER_PYSCRIPT"
|
||||||
|
|
||||||
|
help: ## make TARGET forwards the TARGET to sub packages
|
||||||
|
@cat $(MAKEFILE_LIST) | python -c "$$PRINT_HELP_PYSCRIPT"
|
||||||
@@ -5,8 +5,8 @@ import torch.utils.data
|
|||||||
|
|
||||||
import layers
|
import layers
|
||||||
from utils import load_wav_to_torch, load_filepaths_and_text
|
from utils import load_wav_to_torch, load_filepaths_and_text
|
||||||
from text import text_to_sequence
|
# from text import text_to_sequence
|
||||||
|
from text_codec import text_to_sequence
|
||||||
|
|
||||||
class TextMelLoader(torch.utils.data.Dataset):
|
class TextMelLoader(torch.utils.data.Dataset):
|
||||||
"""
|
"""
|
||||||
@@ -14,9 +14,8 @@ class TextMelLoader(torch.utils.data.Dataset):
|
|||||||
2) normalizes text and converts them to sequences of one-hot vectors
|
2) normalizes text and converts them to sequences of one-hot vectors
|
||||||
3) computes mel-spectrograms from audio files.
|
3) computes mel-spectrograms from audio files.
|
||||||
"""
|
"""
|
||||||
def __init__(self, audiopaths_and_text, hparams, shuffle=True):
|
def __init__(self, audiopaths_and_text, hparams):
|
||||||
self.audiopaths_and_text = load_filepaths_and_text(
|
self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
|
||||||
audiopaths_and_text, hparams.sort_by_length)
|
|
||||||
self.text_cleaners = hparams.text_cleaners
|
self.text_cleaners = hparams.text_cleaners
|
||||||
self.max_wav_value = hparams.max_wav_value
|
self.max_wav_value = hparams.max_wav_value
|
||||||
self.sampling_rate = hparams.sampling_rate
|
self.sampling_rate = hparams.sampling_rate
|
||||||
@@ -26,8 +25,7 @@ class TextMelLoader(torch.utils.data.Dataset):
|
|||||||
hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
|
hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
|
||||||
hparams.mel_fmax)
|
hparams.mel_fmax)
|
||||||
random.seed(1234)
|
random.seed(1234)
|
||||||
if shuffle:
|
random.shuffle(self.audiopaths_and_text)
|
||||||
random.shuffle(self.audiopaths_and_text)
|
|
||||||
|
|
||||||
def get_mel_text_pair(self, audiopath_and_text):
|
def get_mel_text_pair(self, audiopath_and_text):
|
||||||
# separate filename and text
|
# separate filename and text
|
||||||
@@ -38,7 +36,10 @@ class TextMelLoader(torch.utils.data.Dataset):
|
|||||||
|
|
||||||
def get_mel(self, filename):
|
def get_mel(self, filename):
|
||||||
if not self.load_mel_from_disk:
|
if not self.load_mel_from_disk:
|
||||||
audio = load_wav_to_torch(filename, self.sampling_rate)
|
audio, sampling_rate = load_wav_to_torch(filename)
|
||||||
|
if sampling_rate != self.stft.sampling_rate:
|
||||||
|
raise ValueError("{} {} SR doesn't match target {} SR".format(
|
||||||
|
sampling_rate, self.stft.sampling_rate))
|
||||||
audio_norm = audio / self.max_wav_value
|
audio_norm = audio / self.max_wav_value
|
||||||
audio_norm = audio_norm.unsqueeze(0)
|
audio_norm = audio_norm.unsqueeze(0)
|
||||||
audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
|
audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
|
||||||
@@ -87,9 +88,9 @@ class TextMelCollate():
|
|||||||
text = batch[ids_sorted_decreasing[i]][0]
|
text = batch[ids_sorted_decreasing[i]][0]
|
||||||
text_padded[i, :text.size(0)] = text
|
text_padded[i, :text.size(0)] = text
|
||||||
|
|
||||||
# Right zero-pad mel-spec with extra single zero vector to mark the end
|
# Right zero-pad mel-spec
|
||||||
num_mels = batch[0][1].size(0)
|
num_mels = batch[0][1].size(0)
|
||||||
max_target_len = max([x[1].size(1) for x in batch]) + 1
|
max_target_len = max([x[1].size(1) for x in batch])
|
||||||
if max_target_len % self.n_frames_per_step != 0:
|
if max_target_len % self.n_frames_per_step != 0:
|
||||||
max_target_len += self.n_frames_per_step - max_target_len % self.n_frames_per_step
|
max_target_len += self.n_frames_per_step - max_target_len % self.n_frames_per_step
|
||||||
assert max_target_len % self.n_frames_per_step == 0
|
assert max_target_len % self.n_frames_per_step == 0
|
||||||
@@ -103,7 +104,7 @@ class TextMelCollate():
|
|||||||
for i in range(len(ids_sorted_decreasing)):
|
for i in range(len(ids_sorted_decreasing)):
|
||||||
mel = batch[ids_sorted_decreasing[i]][1]
|
mel = batch[ids_sorted_decreasing[i]][1]
|
||||||
mel_padded[i, :, :mel.size(1)] = mel
|
mel_padded[i, :, :mel.size(1)] = mel
|
||||||
gate_padded[i, mel.size(1):] = 1
|
gate_padded[i, mel.size(1)-1:] = 1
|
||||||
output_lengths[i] = mel.size(1)
|
output_lengths[i] = mel.size(1)
|
||||||
|
|
||||||
return text_padded, input_lengths, mel_padded, gate_padded, \
|
return text_padded, input_lengths, mel_padded, gate_padded, \
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
import torch
|
import torch
|
||||||
import torch.distributed as dist
|
import torch.distributed as dist
|
||||||
from torch.nn.modules import Module
|
from torch.nn.modules import Module
|
||||||
|
from torch.autograd import Variable
|
||||||
|
|
||||||
def _flatten_dense_tensors(tensors):
|
def _flatten_dense_tensors(tensors):
|
||||||
"""Flatten dense tensors into a contiguous 1D buffer. Assume tensors are of
|
"""Flatten dense tensors into a contiguous 1D buffer. Assume tensors are of
|
||||||
@@ -118,3 +119,55 @@ class DistributedDataParallel(Module):
|
|||||||
super(DistributedDataParallel, self).train(mode)
|
super(DistributedDataParallel, self).train(mode)
|
||||||
self.module.train(mode)
|
self.module.train(mode)
|
||||||
'''
|
'''
|
||||||
|
'''
|
||||||
|
Modifies existing model to do gradient allreduce, but doesn't change class
|
||||||
|
so you don't need "module"
|
||||||
|
'''
|
||||||
|
def apply_gradient_allreduce(module):
    """Add gradient all-reduce to ``module`` in place and return the same object.

    Unlike a wrapper class, the model keeps its own type, so callers do not
    need to reach through a ``.module`` attribute.

    What this sets up, in order:
      1. every tensor in ``module.state_dict()`` is broadcast from rank 0;
      2. a hook on each trainable parameter queues ``allreduce_params`` on the
         autograd engine whenever that hook fires;
      3. a forward hook sets ``module.needs_reduction`` so that the queued
         callbacks perform the reduction at most once per forward pass.

    Args:
        module: the model to modify; it must provide ``state_dict``,
            ``parameters`` and ``register_forward_hook`` (presumably a
            ``torch.nn.Module`` -- confirm against the caller).

    Returns:
        The ``module`` that was passed in.

    NOTE(review): ``dist.broadcast`` / ``dist.all_reduce`` are called without
    an explicit group, so this presumably requires the default process group
    to be initialised beforehand -- confirm against the caller.
    """
    # Builds that do not expose ``dist._backend`` give no way to tell which
    # backend is active, so the half-precision warning stays armed.
    if not hasattr(dist, '_backend'):
        module.warn_on_half = True
    else:
        module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

    # Broadcast every tensor in the state dict from rank 0.
    for p in module.state_dict().values():
        if not torch.is_tensor(p):
            continue
        dist.broadcast(p, 0)

    def allreduce_params():
        """Average the gradients of all trainable parameters across ranks."""
        # Every parameter hook queues this callback; the flag makes all but
        # the first call after a forward pass a no-op.
        if not module.needs_reduction:
            return
        module.needs_reduction = False

        # Group parameters by dtype so each group can be flattened into one
        # contiguous buffer.
        buckets = {}
        for param in module.parameters():
            if param.requires_grad and param.grad is not None:
                buckets.setdefault(param.data.dtype, []).append(param)

        if module.warn_on_half:
            # Buckets are keyed by dtype, so half-precision parameters appear
            # under torch.float16 (a tensor *type* such as
            # torch.cuda.HalfTensor would never match a key).
            if torch.float16 in buckets:
                print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                      " It is recommended to use the NCCL backend in this case. This currently requires" +
                      " PyTorch built from top of tree master.")
                module.warn_on_half = False

        for bucket in buckets.values():
            grads = [param.grad.data for param in bucket]
            coalesced = _flatten_dense_tensors(grads)
            dist.all_reduce(coalesced)
            # all_reduce is called with its default op; dividing by the world
            # size turns the reduced result into a mean.
            coalesced /= dist.get_world_size()
            for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                buf.copy_(synced)

    def allreduce_hook(*unused):
        # Defer the reduction to an autograd-engine callback instead of
        # reducing inside the per-parameter hook itself.
        Variable._execution_engine.queue_callback(allreduce_params)

    for param in list(module.parameters()):
        if param.requires_grad:
            param.register_hook(allreduce_hook)

    def set_needs_reduction(self, input, output):
        # Forward hook: mark that the next backward pass must reduce.
        self.needs_reduction = True

    module.register_forward_hook(set_needs_reduction)
    return module
|
||||||
|
|||||||
@@ -1,381 +0,0 @@
|
|||||||
import torch
|
|
||||||
from torch import nn
|
|
||||||
from torch.autograd import Variable
|
|
||||||
from torch.nn.parameter import Parameter
|
|
||||||
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
|
|
||||||
|
|
||||||
from loss_scaler import DynamicLossScaler, LossScaler
|
|
||||||
|
|
||||||
FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor)
|
|
||||||
HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor)
|
|
||||||
|
|
||||||
def conversion_helper(val, conversion):
    """Apply `conversion` to every leaf of `val`, preserving list/tuple nesting.

    A value that is not a list or tuple is passed straight to `conversion`.
    Lists come back as lists and tuples as tuples, with each element
    converted recursively.
    """
    if isinstance(val, (tuple, list)):
        converted = [conversion_helper(item, conversion) for item in val]
        return tuple(converted) if isinstance(val, tuple) else converted
    return conversion(val)
|
|
||||||
|
|
||||||
def fp32_to_fp16(val):
    """Cast the fp32 tensors found in `val` to fp16; leave everything else as is.

    `val` may be a single object or a nested list/tuple structure; the
    traversal is delegated to `conversion_helper`.
    """
    def _to_half(item):
        # Parameters/Variables are type-checked through their `.data`, but the
        # cast itself is applied to the original object.
        payload = item.data if isinstance(item, (Parameter, Variable)) else item
        return item.half() if isinstance(payload, FLOAT_TYPES) else item

    return conversion_helper(val, _to_half)
|
|
||||||
|
|
||||||
def fp16_to_fp32(val):
    """Cast the fp16 tensors found in `val` to fp32; leave everything else as is.

    `val` may be a single object or a nested list/tuple structure; the
    traversal is delegated to `conversion_helper`.
    """
    def _to_float(item):
        # Parameters/Variables are type-checked through their `.data`, but the
        # cast itself is applied to the original object.
        payload = item.data if isinstance(item, (Parameter, Variable)) else item
        return item.float() if isinstance(payload, HALF_TYPES) else item

    return conversion_helper(val, _to_float)
|
|
||||||
|
|
||||||
class FP16_Module(nn.Module):
    """Wrapper that runs a module in half precision behind an fp32 interface.

    The wrapped module is converted with `.half()` and registered under the
    name 'module'.
    """

    def __init__(self, module):
        super(FP16_Module, self).__init__()
        half_module = module.half()
        self.add_module('module', half_module)

    def forward(self, *inputs, **kwargs):
        """Run the wrapped module on fp16 inputs and return fp32 outputs.

        Only positional inputs go through `fp32_to_fp16`; keyword arguments
        are forwarded unchanged.
        """
        half_inputs = fp32_to_fp16(inputs)
        half_outputs = self.module(*half_inputs, **kwargs)
        return fp16_to_fp32(half_outputs)
|
|
||||||
|
|
||||||
class FP16_Optimizer(object):
|
|
||||||
"""
|
|
||||||
FP16_Optimizer is designed to wrap an existing PyTorch optimizer,
|
|
||||||
and enable an fp16 model to be trained using a master copy of fp32 weights.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
optimizer (torch.optim.optimizer): Existing optimizer containing initialized fp16 parameters. Internally, FP16_Optimizer replaces the passed optimizer's fp16 parameters with new fp32 parameters copied from the original ones. FP16_Optimizer also stores references to the original fp16 parameters, and updates these fp16 parameters from the master fp32 copy after each step.
|
|
||||||
static_loss_scale (float, optional, default=1.0): Loss scale used internally to scale fp16 gradients computed by the model. Scaled gradients will be copied to fp32, then downscaled before being applied to the fp32 master params, so static_loss_scale should not affect learning rate.
|
|
||||||
dynamic_loss_scale (bool, optional, default=False): Use dynamic loss scaling. If True, this will override any static_loss_scale option.
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, optimizer, static_loss_scale=1.0, dynamic_loss_scale=False):
|
|
||||||
if not torch.cuda.is_available:
|
|
||||||
raise SystemError('Cannot use fp16 without CUDA')
|
|
||||||
|
|
||||||
self.fp16_param_groups = []
|
|
||||||
self.fp32_param_groups = []
|
|
||||||
self.fp32_flattened_groups = []
|
|
||||||
for i, param_group in enumerate(optimizer.param_groups):
|
|
||||||
print("FP16_Optimizer processing param group {}:".format(i))
|
|
||||||
fp16_params_this_group = []
|
|
||||||
fp32_params_this_group = []
|
|
||||||
for param in param_group['params']:
|
|
||||||
if param.requires_grad:
|
|
||||||
if param.type() == 'torch.cuda.HalfTensor':
|
|
||||||
print("FP16_Optimizer received torch.cuda.HalfTensor with {}"
|
|
||||||
.format(param.size()))
|
|
||||||
fp16_params_this_group.append(param)
|
|
||||||
elif param.type() == 'torch.cuda.FloatTensor':
|
|
||||||
print("FP16_Optimizer received torch.cuda.FloatTensor with {}"
|
|
||||||
.format(param.size()))
|
|
||||||
fp32_params_this_group.append(param)
|
|
||||||
else:
|
|
||||||
raise TypeError("Wrapped parameters must be either "
|
|
||||||
"torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
|
|
||||||
"Received {}".format(param.type()))
|
|
||||||
|
|
||||||
fp32_flattened_this_group = None
|
|
||||||
if len(fp16_params_this_group) > 0:
|
|
||||||
fp32_flattened_this_group = _flatten_dense_tensors(
|
|
||||||
[param.detach().data.clone().float() for param in fp16_params_this_group])
|
|
||||||
|
|
||||||
fp32_flattened_this_group = Variable(fp32_flattened_this_group, requires_grad = True)
|
|
||||||
|
|
||||||
fp32_flattened_this_group.grad = fp32_flattened_this_group.new(
|
|
||||||
*fp32_flattened_this_group.size())
|
|
||||||
|
|
||||||
# python's lovely list concatenation via +
|
|
||||||
if fp32_flattened_this_group is not None:
|
|
||||||
param_group['params'] = [fp32_flattened_this_group] + fp32_params_this_group
|
|
||||||
else:
|
|
||||||
param_group['params'] = fp32_params_this_group
|
|
||||||
|
|
||||||
self.fp16_param_groups.append(fp16_params_this_group)
|
|
||||||
self.fp32_param_groups.append(fp32_params_this_group)
|
|
||||||
self.fp32_flattened_groups.append(fp32_flattened_this_group)
|
|
||||||
|
|
||||||
# print("self.fp32_flattened_groups = ", self.fp32_flattened_groups)
|
|
||||||
# print("self.fp16_param_groups = ", self.fp16_param_groups)
|
|
||||||
|
|
||||||
self.optimizer = optimizer.__class__(optimizer.param_groups)
|
|
||||||
|
|
||||||
# self.optimizer.load_state_dict(optimizer.state_dict())
|
|
||||||
|
|
||||||
self.param_groups = self.optimizer.param_groups
|
|
||||||
|
|
||||||
if dynamic_loss_scale:
|
|
||||||
self.dynamic_loss_scale = True
|
|
||||||
self.loss_scaler = DynamicLossScaler()
|
|
||||||
else:
|
|
||||||
self.dynamic_loss_scale = False
|
|
||||||
self.loss_scaler = LossScaler(static_loss_scale)
|
|
||||||
|
|
||||||
self.overflow = False
|
|
||||||
self.first_closure_call_this_step = True
|
|
||||||
|
|
||||||
def zero_grad(self):
|
|
||||||
"""
|
|
||||||
Zero fp32 and fp16 parameter grads.
|
|
||||||
"""
|
|
||||||
self.optimizer.zero_grad()
|
|
||||||
for fp16_group in self.fp16_param_groups:
|
|
||||||
for param in fp16_group:
|
|
||||||
if param.grad is not None:
|
|
||||||
param.grad.detach_() # This does appear in torch.optim.optimizer.zero_grad(),
|
|
||||||
# but I'm not sure why it's needed.
|
|
||||||
param.grad.zero_()
|
|
||||||
|
|
||||||
def _check_overflow(self):
|
|
||||||
params = []
|
|
||||||
for group in self.fp16_param_groups:
|
|
||||||
for param in group:
|
|
||||||
params.append(param)
|
|
||||||
for group in self.fp32_param_groups:
|
|
||||||
for param in group:
|
|
||||||
params.append(param)
|
|
||||||
self.overflow = self.loss_scaler.has_overflow(params)
|
|
||||||
|
|
||||||
def _update_scale(self, has_overflow=False):
|
|
||||||
self.loss_scaler.update_scale(has_overflow)
|
|
||||||
|
|
||||||
def _copy_grads_fp16_to_fp32(self):
|
|
||||||
for fp32_group, fp16_group in zip(self.fp32_flattened_groups, self.fp16_param_groups):
|
|
||||||
if len(fp16_group) > 0:
|
|
||||||
# This might incur one more deep copy than is necessary.
|
|
||||||
fp32_group.grad.data.copy_(
|
|
||||||
_flatten_dense_tensors([fp16_param.grad.data for fp16_param in fp16_group]))
|
|
||||||
|
|
||||||
def _downscale_fp32(self):
|
|
||||||
if self.loss_scale != 1.0:
|
|
||||||
for param_group in self.optimizer.param_groups:
|
|
||||||
for param in param_group['params']:
|
|
||||||
param.grad.data.mul_(1./self.loss_scale)
|
|
||||||
|
|
||||||
def clip_fp32_grads(self, clip=-1):
|
|
||||||
if not self.overflow:
|
|
||||||
fp32_params = []
|
|
||||||
for param_group in self.optimizer.param_groups:
|
|
||||||
for param in param_group['params']:
|
|
||||||
fp32_params.append(param)
|
|
||||||
if clip > 0:
|
|
||||||
return torch.nn.utils.clip_grad_norm(fp32_params, clip)
|
|
||||||
|
|
||||||
def _copy_params_fp32_to_fp16(self):
|
|
||||||
for fp16_group, fp32_group in zip(self.fp16_param_groups, self.fp32_flattened_groups):
|
|
||||||
if len(fp16_group) > 0:
|
|
||||||
for fp16_param, fp32_data in zip(fp16_group,
|
|
||||||
_unflatten_dense_tensors(fp32_group.data, fp16_group)):
|
|
||||||
fp16_param.data.copy_(fp32_data)
|
|
||||||
|
|
||||||
def state_dict(self):
|
|
||||||
"""
|
|
||||||
Returns a dict containing the current state of this FP16_Optimizer instance.
|
|
||||||
This dict contains attributes of FP16_Optimizer, as well as the state_dict
|
|
||||||
of the contained Pytorch optimizer.
|
|
||||||
|
|
||||||
Untested.
|
|
||||||
"""
|
|
||||||
state_dict = {}
|
|
||||||
state_dict['loss_scaler'] = self.loss_scaler
|
|
||||||
state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale
|
|
||||||
state_dict['overflow'] = self.overflow
|
|
||||||
state_dict['first_closure_call_this_step'] = self.first_closure_call_this_step
|
|
||||||
state_dict['optimizer_state_dict'] = self.optimizer.state_dict()
|
|
||||||
return state_dict
|
|
||||||
|
|
||||||
def load_state_dict(self, state_dict):
|
|
||||||
"""
|
|
||||||
Loads a state_dict created by an earlier call to state_dict.
|
|
||||||
|
|
||||||
Untested.
|
|
||||||
"""
|
|
||||||
self.loss_scaler = state_dict['loss_scaler']
|
|
||||||
self.dynamic_loss_scale = state_dict['dynamic_loss_scale']
|
|
||||||
self.overflow = state_dict['overflow']
|
|
||||||
self.first_closure_call_this_step = state_dict['first_closure_call_this_step']
|
|
||||||
self.optimizer.load_state_dict(state_dict['optimizer_state_dict'])
|
|
||||||
|
|
||||||
def step(self, closure=None):  # could add clip option.
    """
    Apply one optimizer update, unless the last backward pass overflowed.

    Without a closure, call this after ``fp16_optimizer_obj.backward(loss)``.
    The optimizer given to FP16_Optimizer's constructor updates the fp32
    master copy of the parameters; the updated fp32 values are then copied
    into the fp16 parameters originally referenced by the constructor, so
    another forward pass can be run on the model immediately.

    With a closure, ``step`` may be called without a prior call to
    ``self.backward(loss)``. Every ``loss.backward()`` inside the closure
    must, however, have been replaced by ``fp16_optimizer_obj.backward(loss)``.

    Args:
        closure (optional): Closure forwarded to the underlying optimizer
            originally passed to FP16_Optimizer's constructor. It should call
            zero_grad on the FP16_Optimizer object, compute the loss, call
            .backward(loss), and return the loss.

    Raises:
        TypeError: If a closure is supplied while the loss scaler is a
            DynamicLossScaler.

    Closure example::

        # optimizer is assumed to be an FP16_Optimizer object, previously
        # constructed from an existing pytorch optimizer.
        for input, target in dataset:
            def closure():
                optimizer.zero_grad()
                output = model(input)
                loss = loss_fn(output, target)
                optimizer.backward(loss)
                return loss
            optimizer.step(closure)

    .. note::
        Compared to `ordinary optimizer closures`_, the only differences are
        that "optimizer" itself is an FP16_Optimizer instance and that
        loss.backward is replaced by optimizer.backward(loss).

    .. warning::
        Currently, calling step with a closure is not compatible with dynamic
        loss scaling.

    .. _`ordinary optimizer closures`:
        http://pytorch.org/docs/master/optim.html#optimizer-step-closure
    """
    uses_closure = closure is not None
    if uses_closure and isinstance(self.loss_scaler, DynamicLossScaler):
        raise TypeError("Using step with a closure is currently not "
                        "compatible with dynamic loss scaling.")

    # Read the scale before _update_scale runs so the message below reports
    # the scale that was attempted (presumably _update_scale may change it --
    # confirm against its definition).
    attempted_scale = self.loss_scaler.loss_scale
    self._update_scale(self.overflow)

    if self.overflow:
        # Skip the update entirely for this iteration.
        print("OVERFLOW! Skipping step. Attempted loss scale: {}".format(attempted_scale))
        return

    if uses_closure:
        self._step_with_closure(closure)
    else:
        self.optimizer.step()

    # Push the freshly updated fp32 master values back into the fp16 params.
    self._copy_params_fp32_to_fp16()
|
|
||||||
|
|
||||||
def _step_with_closure(self, closure):
    """
    Run ``self.optimizer.step`` with a wrapper around the user's closure.

    The wrapper keeps the fp16 parameters in sync with the fp32 master
    parameters when the underlying optimizer evaluates the closure more than
    once within a single step.

    Args:
        closure: Callable that computes and returns the loss. Our API expects
            the user to give us ownership of the backward() call by replacing
            every loss.backward() with optimizer.backward(loss); that holds
            inside a closure too. When the closure does so, calling it gives
            the fp32 master params fresh gradients for the optimizer to use.

    Raises:
        Whatever ``self.optimizer.step`` or ``closure`` raises. The
        first-call flag is reset to True in that case as well, so a failed
        step does not leave stale state behind for the next one.
    """
    def wrapped_closure():
        if self.first_closure_call_this_step:
            # We expect the fp16 params to be fresh on entering self.step(),
            # so no fp32->fp16 copy is needed the first time the closure is
            # evaluated within self.optimizer.step().
            self.first_closure_call_this_step = False
        else:
            # If self.optimizer.step() evaluates the closure more than once,
            # it may update the fp32 params after each evaluation. The
            # optimizer knows nothing about the fp16 params, so they are
            # refreshed manually here.
            self._copy_params_fp32_to_fp16()

        # All that is left is to evaluate the user's closure and hand its
        # loss back to the optimizer.
        return closure()

    try:
        self.optimizer.step(wrapped_closure)
    finally:
        # Re-arm the flag for the next step, even if this one raised.
        self.first_closure_call_this_step = True
|
|
||||||
|
|
||||||
def backward(self, loss, update_fp32_grads=True):
    """
    Run the loss-scaled backward pass for ``loss``.

    Conceptually, fp16_optimizer_obj.backward does the following:

    1. ``fp32_loss = loss.float()`` (see the first note below).
    2. ``scaled_loss = fp32_loss*loss_scale``.
    3. ``scaled_loss.backward()``, which accumulates scaled gradients into
       the .grad attributes of the fp16 model's leaves.
    4. fp16 grads are copied to the stored fp32 params' .grad attributes
       (see the second note below).
    5. fp32 grads are divided by loss_scale.

    Afterwards the fp32 parameters hold fresh gradients and
    fp16_optimizer_obj.step may be called.

    .. note::
        Converting the loss to fp32 before the loss scale is applied gives
        some extra safety against overflow when an fp16 value is supplied.
        For maximum overflow safety, however, compute the loss criterion
        (MSE, cross entropy, etc) in fp32 before passing it to
        fp16_optimizer_obj.backward.

    .. note::
        Gradients found in the fp16 model's leaves after this call should not
        be regarded as valid in general: they may have been scaled, and with
        dynamic loss scaling the scale factor may silently change over time.
        To inspect gradients after this call, query the .grad attribute of
        FP16_Optimizer's stored fp32 parameters instead.

    Args:
        loss: The loss output by the user's model. loss may be either float
            or half (but see the first note above).
        update_fp32_grads (bool, optional, default=True): Whether to copy
            fp16 grads to fp32 grads on this call. Passing False delays the
            copy, which removes redundant fp16->fp32 grad copies when
            fp16_optimizer_obj.backward is called on several losses in one
            iteration. The caller is then responsible for calling
            fp16_optimizer_obj.update_fp32_grads before
            fp16_optimizer_obj.step.

    Example::

        # Ordinary operation:
        optimizer.backward(loss)

        # Naive operation with multiple losses (technically valid, but less
        # efficient): fp32 grads will be correct after the second call, but
        # the first call incurs an unnecessary fp16->fp32 grad copy.
        optimizer.backward(loss1)
        optimizer.backward(loss2)

        # More efficient way to handle multiple losses: the fp16->fp32 grad
        # copy is delayed until fp16 grads from all losses have been
        # accumulated.
        optimizer.backward(loss1, update_fp32_grads=False)
        optimizer.backward(loss2, update_fp32_grads=False)
        optimizer.update_fp32_grads()
    """
    fp32_loss = loss.float()
    self.loss_scaler.backward(fp32_loss)

    if not update_fp32_grads:
        # Caller asked to defer the fp16->fp32 gradient copy.
        return
    self.update_fp32_grads()
|
|
||||||
|
|
||||||
def update_fp32_grads(self):
    """
    Copy the .grad attribute from the stored references to fp16 parameters
    into the .grad attribute of the master fp32 parameters that the optimizer
    updates directly, then downscale the fp32 grads.

    With dynamic loss scaling enabled, an overflow check runs first and the
    copy is skipped when an overflow is flagged.

    :attr:`update_fp32_grads` only needs to be called explicitly if
    fp16_optimizer_obj.backward was called with update_fp32_grads=False.
    """
    overflow_detected = False
    if self.dynamic_loss_scale:
        # Read self.overflow only after _check_overflow has run.
        self._check_overflow()
        overflow_detected = self.overflow

    if not overflow_detected:
        self._copy_grads_fp16_to_fp32()
        self._downscale_fp32()
|
|
||||||
|
|
||||||
@property
def loss_scale(self):
    """Return the ``loss_scale`` currently held by ``self.loss_scaler``."""
    scaler = self.loss_scaler
    return scaler.loss_scale
|
|
||||||
25
hparams.py
25
hparams.py
@@ -10,15 +10,16 @@ def create_hparams(hparams_string=None, verbose=False):
|
|||||||
# Experiment Parameters #
|
# Experiment Parameters #
|
||||||
################################
|
################################
|
||||||
epochs=500,
|
epochs=500,
|
||||||
iters_per_checkpoint=500,
|
iters_per_checkpoint=1000,
|
||||||
seed=1234,
|
seed=1234,
|
||||||
dynamic_loss_scaling=True,
|
dynamic_loss_scaling=True,
|
||||||
fp16_run=False,
|
fp16_run=False,
|
||||||
distributed_run=False,
|
distributed_run=False,
|
||||||
dist_backend="nccl",
|
dist_backend="nccl",
|
||||||
dist_url="file://distributed.dpt",
|
dist_url="tcp://localhost:54321",
|
||||||
cudnn_enabled=True,
|
cudnn_enabled=True,
|
||||||
cudnn_benchmark=False,
|
cudnn_benchmark=False,
|
||||||
|
ignore_layers=['embedding.weight'],
|
||||||
|
|
||||||
################################
|
################################
|
||||||
# Data Parameters #
|
# Data Parameters #
|
||||||
@@ -27,7 +28,6 @@ def create_hparams(hparams_string=None, verbose=False):
|
|||||||
training_files='filelists/ljs_audio_text_train_filelist.txt',
|
training_files='filelists/ljs_audio_text_train_filelist.txt',
|
||||||
validation_files='filelists/ljs_audio_text_val_filelist.txt',
|
validation_files='filelists/ljs_audio_text_val_filelist.txt',
|
||||||
text_cleaners=['english_cleaners'],
|
text_cleaners=['english_cleaners'],
|
||||||
sort_by_length=False,
|
|
||||||
|
|
||||||
################################
|
################################
|
||||||
# Audio Parameters #
|
# Audio Parameters #
|
||||||
@@ -37,14 +37,14 @@ def create_hparams(hparams_string=None, verbose=False):
|
|||||||
filter_length=1024,
|
filter_length=1024,
|
||||||
hop_length=256,
|
hop_length=256,
|
||||||
win_length=1024,
|
win_length=1024,
|
||||||
n_mel_channels=80,
|
n_mel_channels=40,
|
||||||
mel_fmin=0.0,
|
mel_fmin=0.0,
|
||||||
mel_fmax=None, # if None, half the sampling rate
|
mel_fmax=4000.0,
|
||||||
|
|
||||||
################################
|
################################
|
||||||
# Model Parameters #
|
# Model Parameters #
|
||||||
################################
|
################################
|
||||||
n_symbols=len(symbols),
|
n_symbols=1000,#len(symbols),
|
||||||
symbols_embedding_dim=512,
|
symbols_embedding_dim=512,
|
||||||
|
|
||||||
# Encoder parameters
|
# Encoder parameters
|
||||||
@@ -53,11 +53,13 @@ def create_hparams(hparams_string=None, verbose=False):
|
|||||||
encoder_embedding_dim=512,
|
encoder_embedding_dim=512,
|
||||||
|
|
||||||
# Decoder parameters
|
# Decoder parameters
|
||||||
n_frames_per_step=1,
|
n_frames_per_step=1, # currently only 1 is supported
|
||||||
decoder_rnn_dim=1024,
|
decoder_rnn_dim=1024,
|
||||||
prenet_dim=256,
|
prenet_dim=256,
|
||||||
max_decoder_steps=1000,
|
max_decoder_steps=1000,
|
||||||
gate_threshold=0.6,
|
gate_threshold=0.5,
|
||||||
|
p_attention_dropout=0.1,
|
||||||
|
p_decoder_dropout=0.1,
|
||||||
|
|
||||||
# Attention parameters
|
# Attention parameters
|
||||||
attention_rnn_dim=1024,
|
attention_rnn_dim=1024,
|
||||||
@@ -75,11 +77,12 @@ def create_hparams(hparams_string=None, verbose=False):
|
|||||||
################################
|
################################
|
||||||
# Optimization Hyperparameters #
|
# Optimization Hyperparameters #
|
||||||
################################
|
################################
|
||||||
|
use_saved_learning_rate=False,
|
||||||
learning_rate=1e-3,
|
learning_rate=1e-3,
|
||||||
weight_decay=1e-6,
|
weight_decay=1e-6,
|
||||||
grad_clip_thresh=1,
|
grad_clip_thresh=1.0,
|
||||||
batch_size=48,
|
batch_size=64,
|
||||||
mask_padding=False # set model's padded outputs to padded values
|
mask_padding=True # set model's padded outputs to padded values
|
||||||
)
|
)
|
||||||
|
|
||||||
if hparams_string:
|
if hparams_string:
|
||||||
|
|||||||
180
inference.ipynb
180
inference.ipynb
File diff suppressed because one or more lines are too long
@@ -10,7 +10,7 @@ class LinearNorm(torch.nn.Module):
|
|||||||
super(LinearNorm, self).__init__()
|
super(LinearNorm, self).__init__()
|
||||||
self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
|
self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
|
||||||
|
|
||||||
torch.nn.init.xavier_uniform(
|
torch.nn.init.xavier_uniform_(
|
||||||
self.linear_layer.weight,
|
self.linear_layer.weight,
|
||||||
gain=torch.nn.init.calculate_gain(w_init_gain))
|
gain=torch.nn.init.calculate_gain(w_init_gain))
|
||||||
|
|
||||||
@@ -31,7 +31,7 @@ class ConvNorm(torch.nn.Module):
|
|||||||
padding=padding, dilation=dilation,
|
padding=padding, dilation=dilation,
|
||||||
bias=bias)
|
bias=bias)
|
||||||
|
|
||||||
torch.nn.init.xavier_uniform(
|
torch.nn.init.xavier_uniform_(
|
||||||
self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
|
self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
|
||||||
|
|
||||||
def forward(self, signal):
|
def forward(self, signal):
|
||||||
@@ -42,7 +42,7 @@ class ConvNorm(torch.nn.Module):
|
|||||||
class TacotronSTFT(torch.nn.Module):
|
class TacotronSTFT(torch.nn.Module):
|
||||||
def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
|
def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
|
||||||
n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
|
n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
|
||||||
mel_fmax=None):
|
mel_fmax=8000.0):
|
||||||
super(TacotronSTFT, self).__init__()
|
super(TacotronSTFT, self).__init__()
|
||||||
self.n_mel_channels = n_mel_channels
|
self.n_mel_channels = n_mel_channels
|
||||||
self.sampling_rate = sampling_rate
|
self.sampling_rate = sampling_rate
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
import random
|
import random
|
||||||
import torch.nn.functional as F
|
import torch
|
||||||
from tensorboardX import SummaryWriter
|
from tensorboardX import SummaryWriter
|
||||||
from plotting_utils import plot_alignment_to_numpy, plot_spectrogram_to_numpy
|
from plotting_utils import plot_alignment_to_numpy, plot_spectrogram_to_numpy
|
||||||
from plotting_utils import plot_gate_outputs_to_numpy
|
from plotting_utils import plot_gate_outputs_to_numpy
|
||||||
@@ -44,5 +44,5 @@ class Tacotron2Logger(SummaryWriter):
|
|||||||
"gate",
|
"gate",
|
||||||
plot_gate_outputs_to_numpy(
|
plot_gate_outputs_to_numpy(
|
||||||
gate_targets[idx].data.cpu().numpy(),
|
gate_targets[idx].data.cpu().numpy(),
|
||||||
F.sigmoid(gate_outputs[idx]).data.cpu().numpy()),
|
torch.sigmoid(gate_outputs[idx]).data.cpu().numpy()),
|
||||||
iteration)
|
iteration)
|
||||||
|
|||||||
89
model.py
89
model.py
@@ -1,10 +1,10 @@
|
|||||||
|
from math import sqrt
|
||||||
import torch
|
import torch
|
||||||
from torch.autograd import Variable
|
from torch.autograd import Variable
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from torch.nn import functional as F
|
from torch.nn import functional as F
|
||||||
from layers import ConvNorm, LinearNorm
|
from layers import ConvNorm, LinearNorm
|
||||||
from utils import to_gpu, get_mask_from_lengths
|
from utils import to_gpu, get_mask_from_lengths
|
||||||
from fp16_optimizer import fp32_to_fp16, fp16_to_fp32
|
|
||||||
|
|
||||||
|
|
||||||
class LocationLayer(nn.Module):
|
class LocationLayer(nn.Module):
|
||||||
@@ -56,7 +56,7 @@ class Attention(nn.Module):
|
|||||||
|
|
||||||
processed_query = self.query_layer(query.unsqueeze(1))
|
processed_query = self.query_layer(query.unsqueeze(1))
|
||||||
processed_attention_weights = self.location_layer(attention_weights_cat)
|
processed_attention_weights = self.location_layer(attention_weights_cat)
|
||||||
energies = self.v(F.tanh(
|
energies = self.v(torch.tanh(
|
||||||
processed_query + processed_attention_weights + processed_memory))
|
processed_query + processed_attention_weights + processed_memory))
|
||||||
|
|
||||||
energies = energies.squeeze(-1)
|
energies = energies.squeeze(-1)
|
||||||
@@ -107,7 +107,6 @@ class Postnet(nn.Module):
|
|||||||
|
|
||||||
def __init__(self, hparams):
|
def __init__(self, hparams):
|
||||||
super(Postnet, self).__init__()
|
super(Postnet, self).__init__()
|
||||||
self.dropout = nn.Dropout(0.5)
|
|
||||||
self.convolutions = nn.ModuleList()
|
self.convolutions = nn.ModuleList()
|
||||||
|
|
||||||
self.convolutions.append(
|
self.convolutions.append(
|
||||||
@@ -141,9 +140,8 @@ class Postnet(nn.Module):
|
|||||||
|
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
for i in range(len(self.convolutions) - 1):
|
for i in range(len(self.convolutions) - 1):
|
||||||
x = self.dropout(F.tanh(self.convolutions[i](x)))
|
x = F.dropout(torch.tanh(self.convolutions[i](x)), 0.5, self.training)
|
||||||
|
x = F.dropout(self.convolutions[-1](x), 0.5, self.training)
|
||||||
x = self.dropout(self.convolutions[-1](x))
|
|
||||||
|
|
||||||
return x
|
return x
|
||||||
|
|
||||||
@@ -155,7 +153,6 @@ class Encoder(nn.Module):
|
|||||||
"""
|
"""
|
||||||
def __init__(self, hparams):
|
def __init__(self, hparams):
|
||||||
super(Encoder, self).__init__()
|
super(Encoder, self).__init__()
|
||||||
self.dropout = nn.Dropout(0.5)
|
|
||||||
|
|
||||||
convolutions = []
|
convolutions = []
|
||||||
for _ in range(hparams.encoder_n_convolutions):
|
for _ in range(hparams.encoder_n_convolutions):
|
||||||
@@ -175,7 +172,7 @@ class Encoder(nn.Module):
|
|||||||
|
|
||||||
def forward(self, x, input_lengths):
|
def forward(self, x, input_lengths):
|
||||||
for conv in self.convolutions:
|
for conv in self.convolutions:
|
||||||
x = self.dropout(F.relu(conv(x)))
|
x = F.dropout(F.relu(conv(x)), 0.5, self.training)
|
||||||
|
|
||||||
x = x.transpose(1, 2)
|
x = x.transpose(1, 2)
|
||||||
|
|
||||||
@@ -194,7 +191,7 @@ class Encoder(nn.Module):
|
|||||||
|
|
||||||
def inference(self, x):
|
def inference(self, x):
|
||||||
for conv in self.convolutions:
|
for conv in self.convolutions:
|
||||||
x = self.dropout(F.relu(conv(x)))
|
x = F.dropout(F.relu(conv(x)), 0.5, self.training)
|
||||||
|
|
||||||
x = x.transpose(1, 2)
|
x = x.transpose(1, 2)
|
||||||
|
|
||||||
@@ -215,13 +212,15 @@ class Decoder(nn.Module):
|
|||||||
self.prenet_dim = hparams.prenet_dim
|
self.prenet_dim = hparams.prenet_dim
|
||||||
self.max_decoder_steps = hparams.max_decoder_steps
|
self.max_decoder_steps = hparams.max_decoder_steps
|
||||||
self.gate_threshold = hparams.gate_threshold
|
self.gate_threshold = hparams.gate_threshold
|
||||||
|
self.p_attention_dropout = hparams.p_attention_dropout
|
||||||
|
self.p_decoder_dropout = hparams.p_decoder_dropout
|
||||||
|
|
||||||
self.prenet = Prenet(
|
self.prenet = Prenet(
|
||||||
hparams.n_mel_channels * hparams.n_frames_per_step,
|
hparams.n_mel_channels * hparams.n_frames_per_step,
|
||||||
[hparams.prenet_dim, hparams.prenet_dim])
|
[hparams.prenet_dim, hparams.prenet_dim])
|
||||||
|
|
||||||
self.attention_rnn = nn.LSTMCell(
|
self.attention_rnn = nn.LSTMCell(
|
||||||
hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
|
hparams.prenet_dim + hparams.encoder_embedding_dim,
|
||||||
hparams.attention_rnn_dim)
|
hparams.attention_rnn_dim)
|
||||||
|
|
||||||
self.attention_layer = Attention(
|
self.attention_layer = Attention(
|
||||||
@@ -230,12 +229,12 @@ class Decoder(nn.Module):
|
|||||||
hparams.attention_location_kernel_size)
|
hparams.attention_location_kernel_size)
|
||||||
|
|
||||||
self.decoder_rnn = nn.LSTMCell(
|
self.decoder_rnn = nn.LSTMCell(
|
||||||
hparams.prenet_dim + hparams.encoder_embedding_dim,
|
hparams.attention_rnn_dim + hparams.encoder_embedding_dim,
|
||||||
hparams.decoder_rnn_dim, 1)
|
hparams.decoder_rnn_dim, 1)
|
||||||
|
|
||||||
self.linear_projection = LinearNorm(
|
self.linear_projection = LinearNorm(
|
||||||
hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
|
hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
|
||||||
hparams.n_mel_channels*hparams.n_frames_per_step)
|
hparams.n_mel_channels * hparams.n_frames_per_step)
|
||||||
|
|
||||||
self.gate_layer = LinearNorm(
|
self.gate_layer = LinearNorm(
|
||||||
hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, 1,
|
hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, 1,
|
||||||
@@ -350,10 +349,11 @@ class Decoder(nn.Module):
|
|||||||
gate_output: gate output energies
|
gate_output: gate output energies
|
||||||
attention_weights:
|
attention_weights:
|
||||||
"""
|
"""
|
||||||
|
cell_input = torch.cat((decoder_input, self.attention_context), -1)
|
||||||
cell_input = torch.cat((self.decoder_hidden, self.attention_context), -1)
|
|
||||||
self.attention_hidden, self.attention_cell = self.attention_rnn(
|
self.attention_hidden, self.attention_cell = self.attention_rnn(
|
||||||
cell_input, (self.attention_hidden, self.attention_cell))
|
cell_input, (self.attention_hidden, self.attention_cell))
|
||||||
|
self.attention_hidden = F.dropout(
|
||||||
|
self.attention_hidden, self.p_attention_dropout, self.training)
|
||||||
|
|
||||||
attention_weights_cat = torch.cat(
|
attention_weights_cat = torch.cat(
|
||||||
(self.attention_weights.unsqueeze(1),
|
(self.attention_weights.unsqueeze(1),
|
||||||
@@ -363,10 +363,12 @@ class Decoder(nn.Module):
|
|||||||
attention_weights_cat, self.mask)
|
attention_weights_cat, self.mask)
|
||||||
|
|
||||||
self.attention_weights_cum += self.attention_weights
|
self.attention_weights_cum += self.attention_weights
|
||||||
prenet_output = self.prenet(decoder_input)
|
decoder_input = torch.cat(
|
||||||
decoder_input = torch.cat((prenet_output, self.attention_context), -1)
|
(self.attention_hidden, self.attention_context), -1)
|
||||||
self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
|
self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
|
||||||
decoder_input, (self.decoder_hidden, self.decoder_cell))
|
decoder_input, (self.decoder_hidden, self.decoder_cell))
|
||||||
|
self.decoder_hidden = F.dropout(
|
||||||
|
self.decoder_hidden, self.p_decoder_dropout, self.training)
|
||||||
|
|
||||||
decoder_hidden_attention_context = torch.cat(
|
decoder_hidden_attention_context = torch.cat(
|
||||||
(self.decoder_hidden, self.attention_context), dim=1)
|
(self.decoder_hidden, self.attention_context), dim=1)
|
||||||
@@ -391,22 +393,23 @@ class Decoder(nn.Module):
|
|||||||
alignments: sequence of attention weights from the decoder
|
alignments: sequence of attention weights from the decoder
|
||||||
"""
|
"""
|
||||||
|
|
||||||
decoder_input = self.get_go_frame(memory)
|
decoder_input = self.get_go_frame(memory).unsqueeze(0)
|
||||||
decoder_inputs = self.parse_decoder_inputs(decoder_inputs)
|
decoder_inputs = self.parse_decoder_inputs(decoder_inputs)
|
||||||
|
decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0)
|
||||||
|
decoder_inputs = self.prenet(decoder_inputs)
|
||||||
|
|
||||||
self.initialize_decoder_states(
|
self.initialize_decoder_states(
|
||||||
memory, mask=~get_mask_from_lengths(memory_lengths))
|
memory, mask=~get_mask_from_lengths(memory_lengths))
|
||||||
|
|
||||||
mel_outputs, gate_outputs, alignments = [], [], []
|
mel_outputs, gate_outputs, alignments = [], [], []
|
||||||
|
while len(mel_outputs) < decoder_inputs.size(0) - 1:
|
||||||
while len(mel_outputs) < decoder_inputs.size(0):
|
decoder_input = decoder_inputs[len(mel_outputs)]
|
||||||
mel_output, gate_output, attention_weights = self.decode(
|
mel_output, gate_output, attention_weights = self.decode(
|
||||||
decoder_input)
|
decoder_input)
|
||||||
mel_outputs += [mel_output]
|
mel_outputs += [mel_output.squeeze(1)]
|
||||||
gate_outputs += [gate_output.squeeze(1)]
|
gate_outputs += [gate_output.squeeze()]
|
||||||
alignments += [attention_weights]
|
alignments += [attention_weights]
|
||||||
|
|
||||||
decoder_input = decoder_inputs[len(mel_outputs) - 1]
|
|
||||||
|
|
||||||
mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
|
mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
|
||||||
mel_outputs, gate_outputs, alignments)
|
mel_outputs, gate_outputs, alignments)
|
||||||
|
|
||||||
@@ -430,13 +433,14 @@ class Decoder(nn.Module):
|
|||||||
|
|
||||||
mel_outputs, gate_outputs, alignments = [], [], []
|
mel_outputs, gate_outputs, alignments = [], [], []
|
||||||
while True:
|
while True:
|
||||||
|
decoder_input = self.prenet(decoder_input)
|
||||||
mel_output, gate_output, alignment = self.decode(decoder_input)
|
mel_output, gate_output, alignment = self.decode(decoder_input)
|
||||||
|
|
||||||
mel_outputs += [mel_output]
|
mel_outputs += [mel_output.squeeze(1)]
|
||||||
gate_outputs += [gate_output.squeeze(1)]
|
gate_outputs += [gate_output]
|
||||||
alignments += [alignment]
|
alignments += [alignment]
|
||||||
|
|
||||||
if F.sigmoid(gate_output.data) > self.gate_threshold:
|
if torch.sigmoid(gate_output.data) > self.gate_threshold:
|
||||||
break
|
break
|
||||||
elif len(mel_outputs) == self.max_decoder_steps:
|
elif len(mel_outputs) == self.max_decoder_steps:
|
||||||
print("Warning! Reached max decoder steps")
|
print("Warning! Reached max decoder steps")
|
||||||
@@ -459,6 +463,9 @@ class Tacotron2(nn.Module):
|
|||||||
self.n_frames_per_step = hparams.n_frames_per_step
|
self.n_frames_per_step = hparams.n_frames_per_step
|
||||||
self.embedding = nn.Embedding(
|
self.embedding = nn.Embedding(
|
||||||
hparams.n_symbols, hparams.symbols_embedding_dim)
|
hparams.n_symbols, hparams.symbols_embedding_dim)
|
||||||
|
std = sqrt(2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim))
|
||||||
|
val = sqrt(3.0) * std # uniform bounds for std
|
||||||
|
self.embedding.weight.data.uniform_(-val, val)
|
||||||
self.encoder = Encoder(hparams)
|
self.encoder = Encoder(hparams)
|
||||||
self.decoder = Decoder(hparams)
|
self.decoder = Decoder(hparams)
|
||||||
self.postnet = Postnet(hparams)
|
self.postnet = Postnet(hparams)
|
||||||
@@ -467,8 +474,8 @@ class Tacotron2(nn.Module):
|
|||||||
text_padded, input_lengths, mel_padded, gate_padded, \
|
text_padded, input_lengths, mel_padded, gate_padded, \
|
||||||
output_lengths = batch
|
output_lengths = batch
|
||||||
text_padded = to_gpu(text_padded).long()
|
text_padded = to_gpu(text_padded).long()
|
||||||
max_len = int(torch.max(input_lengths.data).numpy())
|
|
||||||
input_lengths = to_gpu(input_lengths).long()
|
input_lengths = to_gpu(input_lengths).long()
|
||||||
|
max_len = torch.max(input_lengths.data).item()
|
||||||
mel_padded = to_gpu(mel_padded).float()
|
mel_padded = to_gpu(mel_padded).float()
|
||||||
gate_padded = to_gpu(gate_padded).float()
|
gate_padded = to_gpu(gate_padded).float()
|
||||||
output_lengths = to_gpu(output_lengths).long()
|
output_lengths = to_gpu(output_lengths).long()
|
||||||
@@ -477,13 +484,9 @@ class Tacotron2(nn.Module):
|
|||||||
(text_padded, input_lengths, mel_padded, max_len, output_lengths),
|
(text_padded, input_lengths, mel_padded, max_len, output_lengths),
|
||||||
(mel_padded, gate_padded))
|
(mel_padded, gate_padded))
|
||||||
|
|
||||||
def parse_input(self, inputs):
|
|
||||||
inputs = fp32_to_fp16(inputs) if self.fp16_run else inputs
|
|
||||||
return inputs
|
|
||||||
|
|
||||||
def parse_output(self, outputs, output_lengths=None):
|
def parse_output(self, outputs, output_lengths=None):
|
||||||
if self.mask_padding and output_lengths is not None:
|
if self.mask_padding and output_lengths is not None:
|
||||||
mask = ~get_mask_from_lengths(output_lengths+1) # +1 <stop> token
|
mask = ~get_mask_from_lengths(output_lengths)
|
||||||
mask = mask.expand(self.n_mel_channels, mask.size(0), mask.size(1))
|
mask = mask.expand(self.n_mel_channels, mask.size(0), mask.size(1))
|
||||||
mask = mask.permute(1, 0, 2)
|
mask = mask.permute(1, 0, 2)
|
||||||
|
|
||||||
@@ -491,39 +494,27 @@ class Tacotron2(nn.Module):
|
|||||||
outputs[1].data.masked_fill_(mask, 0.0)
|
outputs[1].data.masked_fill_(mask, 0.0)
|
||||||
outputs[2].data.masked_fill_(mask[:, 0, :], 1e3) # gate energies
|
outputs[2].data.masked_fill_(mask[:, 0, :], 1e3) # gate energies
|
||||||
|
|
||||||
outputs = fp16_to_fp32(outputs) if self.fp16_run else outputs
|
|
||||||
|
|
||||||
return outputs
|
return outputs
|
||||||
|
|
||||||
def forward(self, inputs):
|
def forward(self, inputs):
|
||||||
inputs, input_lengths, targets, max_len, \
|
text_inputs, text_lengths, mels, max_len, output_lengths = inputs
|
||||||
output_lengths = self.parse_input(inputs)
|
text_lengths, output_lengths = text_lengths.data, output_lengths.data
|
||||||
input_lengths, output_lengths = input_lengths.data, output_lengths.data
|
|
||||||
|
|
||||||
embedded_inputs = self.embedding(inputs).transpose(1, 2)
|
embedded_inputs = self.embedding(text_inputs).transpose(1, 2)
|
||||||
|
|
||||||
encoder_outputs = self.encoder(embedded_inputs, input_lengths)
|
encoder_outputs = self.encoder(embedded_inputs, text_lengths)
|
||||||
|
|
||||||
mel_outputs, gate_outputs, alignments = self.decoder(
|
mel_outputs, gate_outputs, alignments = self.decoder(
|
||||||
encoder_outputs, targets, memory_lengths=input_lengths)
|
encoder_outputs, mels, memory_lengths=text_lengths)
|
||||||
|
|
||||||
mel_outputs_postnet = self.postnet(mel_outputs)
|
mel_outputs_postnet = self.postnet(mel_outputs)
|
||||||
mel_outputs_postnet = mel_outputs + mel_outputs_postnet
|
mel_outputs_postnet = mel_outputs + mel_outputs_postnet
|
||||||
|
|
||||||
# DataParallel expects equal sized inputs/outputs, hence padding
|
|
||||||
if input_lengths is not None:
|
|
||||||
alignments = alignments.unsqueeze(0)
|
|
||||||
alignments = nn.functional.pad(
|
|
||||||
alignments,
|
|
||||||
(0, max_len - alignments.size(3), 0, 0),
|
|
||||||
"constant", 0)
|
|
||||||
alignments = alignments.squeeze()
|
|
||||||
return self.parse_output(
|
return self.parse_output(
|
||||||
[mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
|
[mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
|
||||||
output_lengths)
|
output_lengths)
|
||||||
|
|
||||||
def inference(self, inputs):
|
def inference(self, inputs):
|
||||||
inputs = self.parse_input(inputs)
|
|
||||||
embedded_inputs = self.embedding(inputs).transpose(1, 2)
|
embedded_inputs = self.embedding(inputs).transpose(1, 2)
|
||||||
encoder_outputs = self.encoder.inference(embedded_inputs)
|
encoder_outputs = self.encoder.inference(embedded_inputs)
|
||||||
mel_outputs, gate_outputs, alignments = self.decoder.inference(
|
mel_outputs, gate_outputs, alignments = self.decoder.inference(
|
||||||
|
|||||||
@@ -1,9 +1,9 @@
|
|||||||
torch==0.2.0.post3
|
|
||||||
matplotlib==2.1.0
|
matplotlib==2.1.0
|
||||||
tensorflow
|
tensorflow==1.1.0
|
||||||
numpy==1.13.3
|
numpy==1.17.1
|
||||||
inflect==0.2.5
|
inflect==0.2.5
|
||||||
librosa==0.6.0
|
librosa==0.6.0
|
||||||
scipy==1.0.0
|
scipy==1.0.0
|
||||||
tensorboardX==1.1
|
tensorboardX==1.1
|
||||||
Unidecode==1.0.22
|
Unidecode==1.0.22
|
||||||
|
pillow
|
||||||
|
|||||||
3
stft.py
3
stft.py
@@ -61,7 +61,7 @@ class STFT(torch.nn.Module):
|
|||||||
np.linalg.pinv(scale * fourier_basis).T[:, None, :])
|
np.linalg.pinv(scale * fourier_basis).T[:, None, :])
|
||||||
|
|
||||||
if window is not None:
|
if window is not None:
|
||||||
assert(win_length >= filter_length)
|
assert(filter_length >= win_length)
|
||||||
# get window and zero center pad it to filter_length
|
# get window and zero center pad it to filter_length
|
||||||
fft_window = get_window(window, win_length, fftbins=True)
|
fft_window = get_window(window, win_length, fftbins=True)
|
||||||
fft_window = pad_center(fft_window, filter_length)
|
fft_window = pad_center(fft_window, filter_length)
|
||||||
@@ -124,6 +124,7 @@ class STFT(torch.nn.Module):
|
|||||||
np.where(window_sum > tiny(window_sum))[0])
|
np.where(window_sum > tiny(window_sum))[0])
|
||||||
window_sum = torch.autograd.Variable(
|
window_sum = torch.autograd.Variable(
|
||||||
torch.from_numpy(window_sum), requires_grad=False)
|
torch.from_numpy(window_sum), requires_grad=False)
|
||||||
|
window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum
|
||||||
inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]
|
inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]
|
||||||
|
|
||||||
# scale by hop ratio
|
# scale by hop ratio
|
||||||
|
|||||||
@@ -37,8 +37,6 @@ def text_to_sequence(text, cleaner_names):
|
|||||||
sequence += _arpabet_to_sequence(m.group(2))
|
sequence += _arpabet_to_sequence(m.group(2))
|
||||||
text = m.group(3)
|
text = m.group(3)
|
||||||
|
|
||||||
# Append EOS token
|
|
||||||
sequence.append(_symbol_to_id['~'])
|
|
||||||
return sequence
|
return sequence
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -7,11 +7,12 @@ The default is a set of ASCII characters that works well for English or text tha
|
|||||||
from text import cmudict
|
from text import cmudict
|
||||||
|
|
||||||
_pad = '_'
|
_pad = '_'
|
||||||
_eos = '~'
|
_punctuation = '!\'(),.:;? '
|
||||||
_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '
|
_special = '-'
|
||||||
|
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
|
||||||
|
|
||||||
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
|
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
|
||||||
_arpabet = ['@' + s for s in cmudict.valid_symbols]
|
_arpabet = ['@' + s for s in cmudict.valid_symbols]
|
||||||
|
|
||||||
# Export all symbols:
|
# Export all symbols:
|
||||||
symbols = [_pad, _eos] + list(_characters) + _arpabet
|
symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet
|
||||||
|
|||||||
77
text_codec.py
Normal file
77
text_codec.py
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
from utils import load_filepaths_and_text
|
||||||
|
|
||||||
|
from text import text_to_sequence, sequence_to_text
|
||||||
|
|
||||||
|
from hparams import create_hparams
|
||||||
|
import sentencepiece as spm
|
||||||
|
from text import symbols
|
||||||
|
from bpemb import BPEmb
|
||||||
|
|
||||||
|
|
||||||
|
SPM_CORPUS_FILE = "filelists/text_corpus.txt"
|
||||||
|
SPM_MODEL_PREFIX = "spm"
|
||||||
|
SPM_VOCAB_SIZE = 1000
|
||||||
|
hparams = create_hparams()
|
||||||
|
|
||||||
|
|
||||||
|
def _create_sentencepiece_corpus():
|
||||||
|
def get_text_list(text_file):
|
||||||
|
return [i[1] + "\n" for i in load_filepaths_and_text(text_file)]
|
||||||
|
|
||||||
|
full_text_list = get_text_list(hparams.training_files) + get_text_list(
|
||||||
|
hparams.validation_files
|
||||||
|
)
|
||||||
|
with open(SPM_CORPUS_FILE, "w") as fd:
|
||||||
|
fd.writelines(full_text_list)
|
||||||
|
|
||||||
|
|
||||||
|
def _create_sentencepiece_vocab(vocab_size=SPM_VOCAB_SIZE):
|
||||||
|
train_params = "--input={} --model_type=unigram --character_coverage=1.0 --model_prefix={} --vocab_size={}".format(
|
||||||
|
SPM_CORPUS_FILE, SPM_MODEL_PREFIX, vocab_size
|
||||||
|
)
|
||||||
|
spm.SentencePieceTrainer.Train(train_params)
|
||||||
|
|
||||||
|
|
||||||
|
def _spm_text_codecs():
|
||||||
|
sp = spm.SentencePieceProcessor()
|
||||||
|
sp.Load("{}.model".format(SPM_MODEL_PREFIX))
|
||||||
|
|
||||||
|
def ttseq(text, cleaners):
|
||||||
|
return sp.EncodeAsIds(text)
|
||||||
|
|
||||||
|
def seqtt(sequence):
|
||||||
|
return sp.DecodeIds(sequence)
|
||||||
|
|
||||||
|
return ttseq, seqtt
|
||||||
|
|
||||||
|
|
||||||
|
def _bpemb_text_codecs():
|
||||||
|
bpemb_en = BPEmb(lang="en", dim=50, vs=148)
|
||||||
|
def ttseq(text, cleaners):
|
||||||
|
return bpemb_en.encode_ids(text)
|
||||||
|
|
||||||
|
def seqtt(sequence):
|
||||||
|
return bpemb_en.decode_ids(sequence)
|
||||||
|
|
||||||
|
return ttseq, seqtt
|
||||||
|
|
||||||
|
# text_to_sequence, sequence_to_text = _spm_text_codecs()
|
||||||
|
text_to_sequence, sequence_to_text = _bpemb_text_codecs()
|
||||||
|
|
||||||
|
|
||||||
|
def _interactive_test():
|
||||||
|
prompt = "Hello world; how are you, doing ?"
|
||||||
|
while prompt not in ["q", "quit"]:
|
||||||
|
oup = sequence_to_text(text_to_sequence(prompt, hparams.text_cleaners))
|
||||||
|
print('==> ',oup)
|
||||||
|
prompt = input("> ")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
# _create_sentencepiece_corpus()
|
||||||
|
# _create_sentencepiece_vocab()
|
||||||
|
_interactive_test()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
134
train.py
134
train.py
@@ -5,13 +5,11 @@ import math
|
|||||||
from numpy import finfo
|
from numpy import finfo
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from distributed import DistributedDataParallel
|
from distributed import apply_gradient_allreduce
|
||||||
|
import torch.distributed as dist
|
||||||
from torch.utils.data.distributed import DistributedSampler
|
from torch.utils.data.distributed import DistributedSampler
|
||||||
from torch.nn import DataParallel
|
|
||||||
from torch.utils.data import DataLoader
|
from torch.utils.data import DataLoader
|
||||||
|
|
||||||
from fp16_optimizer import FP16_Optimizer
|
|
||||||
|
|
||||||
from model import Tacotron2
|
from model import Tacotron2
|
||||||
from data_utils import TextMelLoader, TextMelCollate
|
from data_utils import TextMelLoader, TextMelCollate
|
||||||
from loss_function import Tacotron2Loss
|
from loss_function import Tacotron2Loss
|
||||||
@@ -19,30 +17,22 @@ from logger import Tacotron2Logger
|
|||||||
from hparams import create_hparams
|
from hparams import create_hparams
|
||||||
|
|
||||||
|
|
||||||
def batchnorm_to_float(module):
|
def reduce_tensor(tensor, n_gpus):
|
||||||
"""Converts batch norm modules to FP32"""
|
|
||||||
if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
|
|
||||||
module.float()
|
|
||||||
for child in module.children():
|
|
||||||
batchnorm_to_float(child)
|
|
||||||
return module
|
|
||||||
|
|
||||||
|
|
||||||
def reduce_tensor(tensor, num_gpus):
|
|
||||||
rt = tensor.clone()
|
rt = tensor.clone()
|
||||||
torch.distributed.all_reduce(rt, op=torch.distributed.reduce_op.SUM)
|
dist.all_reduce(rt, op=dist.reduce_op.SUM)
|
||||||
rt /= num_gpus
|
rt /= n_gpus
|
||||||
return rt
|
return rt
|
||||||
|
|
||||||
|
|
||||||
def init_distributed(hparams, n_gpus, rank, group_name):
|
def init_distributed(hparams, n_gpus, rank, group_name):
|
||||||
assert torch.cuda.is_available(), "Distributed mode requires CUDA."
|
assert torch.cuda.is_available(), "Distributed mode requires CUDA."
|
||||||
print("Initializing distributed")
|
print("Initializing Distributed")
|
||||||
|
|
||||||
# Set cuda device so everything is done on the right GPU.
|
# Set cuda device so everything is done on the right GPU.
|
||||||
torch.cuda.set_device(rank % torch.cuda.device_count())
|
torch.cuda.set_device(rank % torch.cuda.device_count())
|
||||||
|
|
||||||
# Initialize distributed communication
|
# Initialize distributed communication
|
||||||
torch.distributed.init_process_group(
|
dist.init_process_group(
|
||||||
backend=hparams.dist_backend, init_method=hparams.dist_url,
|
backend=hparams.dist_backend, init_method=hparams.dist_url,
|
||||||
world_size=n_gpus, rank=rank, group_name=group_name)
|
world_size=n_gpus, rank=rank, group_name=group_name)
|
||||||
|
|
||||||
@@ -55,10 +45,14 @@ def prepare_dataloaders(hparams):
|
|||||||
valset = TextMelLoader(hparams.validation_files, hparams)
|
valset = TextMelLoader(hparams.validation_files, hparams)
|
||||||
collate_fn = TextMelCollate(hparams.n_frames_per_step)
|
collate_fn = TextMelCollate(hparams.n_frames_per_step)
|
||||||
|
|
||||||
train_sampler = DistributedSampler(trainset) \
|
if hparams.distributed_run:
|
||||||
if hparams.distributed_run else None
|
train_sampler = DistributedSampler(trainset)
|
||||||
|
shuffle = False
|
||||||
|
else:
|
||||||
|
train_sampler = None
|
||||||
|
shuffle = True
|
||||||
|
|
||||||
train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
|
train_loader = DataLoader(trainset, num_workers=1, shuffle=shuffle,
|
||||||
sampler=train_sampler,
|
sampler=train_sampler,
|
||||||
batch_size=hparams.batch_size, pin_memory=False,
|
batch_size=hparams.batch_size, pin_memory=False,
|
||||||
drop_last=True, collate_fn=collate_fn)
|
drop_last=True, collate_fn=collate_fn)
|
||||||
@@ -79,22 +73,26 @@ def prepare_directories_and_logger(output_directory, log_directory, rank):
|
|||||||
def load_model(hparams):
|
def load_model(hparams):
|
||||||
model = Tacotron2(hparams).cuda()
|
model = Tacotron2(hparams).cuda()
|
||||||
if hparams.fp16_run:
|
if hparams.fp16_run:
|
||||||
model = batchnorm_to_float(model.half())
|
model.decoder.attention_layer.score_mask_value = finfo('float16').min
|
||||||
model.decoder.attention_layer.score_mask_value = float(finfo('float16').min)
|
|
||||||
|
|
||||||
if hparams.distributed_run:
|
if hparams.distributed_run:
|
||||||
model = DistributedDataParallel(model)
|
model = apply_gradient_allreduce(model)
|
||||||
elif torch.cuda.device_count() > 1:
|
|
||||||
model = DataParallel(model)
|
|
||||||
|
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
def warm_start_model(checkpoint_path, model):
|
def warm_start_model(checkpoint_path, model, ignore_layers):
|
||||||
assert os.path.isfile(checkpoint_path)
|
assert os.path.isfile(checkpoint_path)
|
||||||
print("Warm starting model from checkpoint '{}'".format(checkpoint_path))
|
print("Warm starting model from checkpoint '{}'".format(checkpoint_path))
|
||||||
checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
|
checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
|
||||||
model.load_state_dict(checkpoint_dict['state_dict'])
|
model_dict = checkpoint_dict['state_dict']
|
||||||
|
if len(ignore_layers) > 0:
|
||||||
|
model_dict = {k: v for k, v in model_dict.items()
|
||||||
|
if k not in ignore_layers}
|
||||||
|
dummy_dict = model.state_dict()
|
||||||
|
dummy_dict.update(model_dict)
|
||||||
|
model_dict = dummy_dict
|
||||||
|
model.load_state_dict(model_dict)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
@@ -131,22 +129,21 @@ def validate(model, criterion, valset, iteration, batch_size, n_gpus,
|
|||||||
pin_memory=False, collate_fn=collate_fn)
|
pin_memory=False, collate_fn=collate_fn)
|
||||||
|
|
||||||
val_loss = 0.0
|
val_loss = 0.0
|
||||||
if distributed_run or torch.cuda.device_count() > 1:
|
|
||||||
batch_parser = model.module.parse_batch
|
|
||||||
else:
|
|
||||||
batch_parser = model.parse_batch
|
|
||||||
|
|
||||||
for i, batch in enumerate(val_loader):
|
for i, batch in enumerate(val_loader):
|
||||||
x, y = batch_parser(batch)
|
x, y = model.parse_batch(batch)
|
||||||
y_pred = model(x)
|
y_pred = model(x)
|
||||||
loss = criterion(y_pred, y)
|
loss = criterion(y_pred, y)
|
||||||
reduced_val_loss = reduce_tensor(loss.data, n_gpus)[0] \
|
if distributed_run:
|
||||||
if distributed_run else loss.data[0]
|
reduced_val_loss = reduce_tensor(loss.data, n_gpus).item()
|
||||||
|
else:
|
||||||
|
reduced_val_loss = loss.item()
|
||||||
val_loss += reduced_val_loss
|
val_loss += reduced_val_loss
|
||||||
val_loss = val_loss / (i + 1)
|
val_loss = val_loss / (i + 1)
|
||||||
|
|
||||||
model.train()
|
model.train()
|
||||||
return val_loss
|
if rank == 0:
|
||||||
|
print("Validation loss {}: {:9f} ".format(iteration, reduced_val_loss))
|
||||||
|
logger.log_validation(reduced_val_loss, model, y, y_pred, iteration)
|
||||||
|
|
||||||
|
|
||||||
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
|
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
|
||||||
@@ -172,9 +169,14 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
|
|||||||
learning_rate = hparams.learning_rate
|
learning_rate = hparams.learning_rate
|
||||||
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
|
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
|
||||||
weight_decay=hparams.weight_decay)
|
weight_decay=hparams.weight_decay)
|
||||||
|
|
||||||
if hparams.fp16_run:
|
if hparams.fp16_run:
|
||||||
optimizer = FP16_Optimizer(
|
from apex import amp
|
||||||
optimizer, dynamic_loss_scale=hparams.dynamic_loss_scaling)
|
model, optimizer = amp.initialize(
|
||||||
|
model, optimizer, opt_level='O2')
|
||||||
|
|
||||||
|
if hparams.distributed_run:
|
||||||
|
model = apply_gradient_allreduce(model)
|
||||||
|
|
||||||
criterion = Tacotron2Loss()
|
criterion = Tacotron2Loss()
|
||||||
|
|
||||||
@@ -188,18 +190,18 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
|
|||||||
epoch_offset = 0
|
epoch_offset = 0
|
||||||
if checkpoint_path is not None:
|
if checkpoint_path is not None:
|
||||||
if warm_start:
|
if warm_start:
|
||||||
model = warm_start_model(checkpoint_path, model)
|
model = warm_start_model(
|
||||||
|
checkpoint_path, model, hparams.ignore_layers)
|
||||||
else:
|
else:
|
||||||
model, optimizer, learning_rate, iteration = load_checkpoint(
|
model, optimizer, _learning_rate, iteration = load_checkpoint(
|
||||||
checkpoint_path, model, optimizer)
|
checkpoint_path, model, optimizer)
|
||||||
|
if hparams.use_saved_learning_rate:
|
||||||
|
learning_rate = _learning_rate
|
||||||
iteration += 1 # next iteration is iteration + 1
|
iteration += 1 # next iteration is iteration + 1
|
||||||
epoch_offset = max(0, int(iteration / len(train_loader)))
|
epoch_offset = max(0, int(iteration / len(train_loader)))
|
||||||
|
|
||||||
model.train()
|
model.train()
|
||||||
if hparams.distributed_run or torch.cuda.device_count() > 1:
|
is_overflow = False
|
||||||
batch_parser = model.module.parse_batch
|
|
||||||
else:
|
|
||||||
batch_parser = model.parse_batch
|
|
||||||
# ================ MAIN TRAINNIG LOOP! ===================
|
# ================ MAIN TRAINNIG LOOP! ===================
|
||||||
for epoch in range(epoch_offset, hparams.epochs):
|
for epoch in range(epoch_offset, hparams.epochs):
|
||||||
print("Epoch: {}".format(epoch))
|
print("Epoch: {}".format(epoch))
|
||||||
@@ -209,42 +211,42 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
|
|||||||
param_group['lr'] = learning_rate
|
param_group['lr'] = learning_rate
|
||||||
|
|
||||||
model.zero_grad()
|
model.zero_grad()
|
||||||
x, y = batch_parser(batch)
|
x, y = model.parse_batch(batch)
|
||||||
y_pred = model(x)
|
y_pred = model(x)
|
||||||
loss = criterion(y_pred, y)
|
|
||||||
reduced_loss = reduce_tensor(loss.data, n_gpus)[0] \
|
|
||||||
if hparams.distributed_run else loss.data[0]
|
|
||||||
|
|
||||||
|
loss = criterion(y_pred, y)
|
||||||
|
if hparams.distributed_run:
|
||||||
|
reduced_loss = reduce_tensor(loss.data, n_gpus).item()
|
||||||
|
else:
|
||||||
|
reduced_loss = loss.item()
|
||||||
if hparams.fp16_run:
|
if hparams.fp16_run:
|
||||||
optimizer.backward(loss)
|
with amp.scale_loss(loss, optimizer) as scaled_loss:
|
||||||
grad_norm = optimizer.clip_fp32_grads(hparams.grad_clip_thresh)
|
scaled_loss.backward()
|
||||||
else:
|
else:
|
||||||
loss.backward()
|
loss.backward()
|
||||||
grad_norm = torch.nn.utils.clip_grad_norm(
|
|
||||||
|
if hparams.fp16_run:
|
||||||
|
grad_norm = torch.nn.utils.clip_grad_norm_(
|
||||||
|
amp.master_params(optimizer), hparams.grad_clip_thresh)
|
||||||
|
is_overflow = math.isnan(grad_norm)
|
||||||
|
else:
|
||||||
|
grad_norm = torch.nn.utils.clip_grad_norm_(
|
||||||
model.parameters(), hparams.grad_clip_thresh)
|
model.parameters(), hparams.grad_clip_thresh)
|
||||||
|
|
||||||
optimizer.step()
|
optimizer.step()
|
||||||
|
|
||||||
overflow = optimizer.overflow if hparams.fp16_run else False
|
if not is_overflow and rank == 0:
|
||||||
|
|
||||||
if not overflow and not math.isnan(reduced_loss) and rank == 0:
|
|
||||||
duration = time.perf_counter() - start
|
duration = time.perf_counter() - start
|
||||||
print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
|
print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
|
||||||
iteration, reduced_loss, grad_norm, duration))
|
iteration, reduced_loss, grad_norm, duration))
|
||||||
|
|
||||||
logger.log_training(
|
logger.log_training(
|
||||||
reduced_loss, grad_norm, learning_rate, duration, iteration)
|
reduced_loss, grad_norm, learning_rate, duration, iteration)
|
||||||
|
|
||||||
if not overflow and (iteration % hparams.iters_per_checkpoint == 0):
|
if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0):
|
||||||
reduced_val_loss = validate(
|
validate(model, criterion, valset, iteration,
|
||||||
model, criterion, valset, iteration, hparams.batch_size,
|
hparams.batch_size, n_gpus, collate_fn, logger,
|
||||||
n_gpus, collate_fn, logger, hparams.distributed_run, rank)
|
hparams.distributed_run, rank)
|
||||||
|
|
||||||
if rank == 0:
|
if rank == 0:
|
||||||
print("Validation loss {}: {:9f} ".format(
|
|
||||||
iteration, reduced_val_loss))
|
|
||||||
logger.log_validation(
|
|
||||||
reduced_val_loss, model, y, y_pred, iteration)
|
|
||||||
checkpoint_path = os.path.join(
|
checkpoint_path = os.path.join(
|
||||||
output_directory, "checkpoint_{}".format(iteration))
|
output_directory, "checkpoint_{}".format(iteration))
|
||||||
save_checkpoint(model, optimizer, learning_rate, iteration,
|
save_checkpoint(model, optimizer, learning_rate, iteration,
|
||||||
@@ -262,7 +264,7 @@ if __name__ == '__main__':
|
|||||||
parser.add_argument('-c', '--checkpoint_path', type=str, default=None,
|
parser.add_argument('-c', '--checkpoint_path', type=str, default=None,
|
||||||
required=False, help='checkpoint path')
|
required=False, help='checkpoint path')
|
||||||
parser.add_argument('--warm_start', action='store_true',
|
parser.add_argument('--warm_start', action='store_true',
|
||||||
help='load the model only (warm start)')
|
help='load model weights only, ignore specified layers')
|
||||||
parser.add_argument('--n_gpus', type=int, default=1,
|
parser.add_argument('--n_gpus', type=int, default=1,
|
||||||
required=False, help='number of gpus')
|
required=False, help='number of gpus')
|
||||||
parser.add_argument('--rank', type=int, default=0,
|
parser.add_argument('--rank', type=int, default=0,
|
||||||
|
|||||||
21
utils.py
21
utils.py
@@ -4,29 +4,26 @@ import torch
|
|||||||
|
|
||||||
|
|
||||||
def get_mask_from_lengths(lengths):
|
def get_mask_from_lengths(lengths):
|
||||||
max_len = torch.max(lengths)
|
max_len = torch.max(lengths).item()
|
||||||
ids = torch.arange(0, max_len).long().cuda()
|
ids = torch.arange(0, max_len, out=torch.cuda.LongTensor(max_len))
|
||||||
mask = (ids < lengths.unsqueeze(1)).byte()
|
mask = (ids < lengths.unsqueeze(1)).byte()
|
||||||
return mask
|
return mask
|
||||||
|
|
||||||
|
|
||||||
def load_wav_to_torch(full_path, sr):
|
def load_wav_to_torch(full_path):
|
||||||
sampling_rate, data = read(full_path)
|
sampling_rate, data = read(full_path)
|
||||||
assert sr == sampling_rate, "{} SR doesn't match {} on path {}".format(
|
return torch.FloatTensor(data.astype(np.float32)), sampling_rate
|
||||||
sr, sampling_rate, full_path)
|
|
||||||
return torch.FloatTensor(data.astype(np.float32))
|
|
||||||
|
|
||||||
|
|
||||||
def load_filepaths_and_text(filename, sort_by_length, split="|"):
|
def load_filepaths_and_text(filename, split="|"):
|
||||||
with open(filename, encoding='utf-8') as f:
|
with open(filename, encoding='utf-8') as f:
|
||||||
filepaths_and_text = [line.strip().split(split) for line in f]
|
filepaths_and_text = [line.strip().split(split) for line in f]
|
||||||
|
|
||||||
if sort_by_length:
|
|
||||||
filepaths_and_text.sort(key=lambda x: len(x[1]))
|
|
||||||
|
|
||||||
return filepaths_and_text
|
return filepaths_and_text
|
||||||
|
|
||||||
|
|
||||||
def to_gpu(x):
|
def to_gpu(x):
|
||||||
x = x.contiguous().cuda(async=True)
|
x = x.contiguous()
|
||||||
|
|
||||||
|
if torch.cuda.is_available():
|
||||||
|
x = x.cuda(non_blocking=True)
|
||||||
return torch.autograd.Variable(x)
|
return torch.autograd.Variable(x)
|
||||||
|
|||||||
1
waveglow
Submodule
1
waveglow
Submodule
Submodule waveglow added at 4b1001fa33
Reference in New Issue
Block a user