mirror of https://github.com/malarinv/tacotron2
integrate tacotron2/waveglow based tts server
parent
4be2475cc1
commit
5f75aa0a0d
29
LICENSE
29
LICENSE
|
|
@ -1,29 +0,0 @@
|
|||
BSD 3-Clause License
|
||||
|
||||
Copyright (c) 2018, NVIDIA Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the copyright holder nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
23
README.md
23
README.md
|
|
@ -1,23 +0,0 @@
|
|||
|
||||
## Setup
|
||||
- clone the repo
|
||||
|
||||
`git clone https://github.com/agaralabs/tacotron2`
|
||||
- cd to `tacotron2` copy models from wolverine:
|
||||
|
||||
`scp wolverine:/home/ubuntu/tacotron2/{checkpoint_15000,waveglow_256channels.pt} ./`
|
||||
|
||||
`scp wolverine:/home/ubuntu/tacotron2/waveglow ./`
|
||||
|
||||
**Wolverine Details:**
|
||||
```
|
||||
Host wolverine
|
||||
Hostname 54.71.137.17
|
||||
User ubuntu
|
||||
IdentityFile ~/.ssh/id_hip_ml
|
||||
```
|
||||
install the dependencies
|
||||
`pip install requirements.txt`
|
||||
|
||||
## Running:
|
||||
`python final.py`
|
||||
|
|
@ -1,93 +0,0 @@
|
|||
import torch
|
||||
import numpy as np
|
||||
from scipy.signal import get_window
|
||||
import librosa.util as librosa_util
|
||||
|
||||
|
||||
def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
|
||||
n_fft=800, dtype=np.float32, norm=None):
|
||||
"""
|
||||
# from librosa 0.6
|
||||
Compute the sum-square envelope of a window function at a given hop length.
|
||||
|
||||
This is used to estimate modulation effects induced by windowing
|
||||
observations in short-time fourier transforms.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
window : string, tuple, number, callable, or list-like
|
||||
Window specification, as in `get_window`
|
||||
|
||||
n_frames : int > 0
|
||||
The number of analysis frames
|
||||
|
||||
hop_length : int > 0
|
||||
The number of samples to advance between frames
|
||||
|
||||
win_length : [optional]
|
||||
The length of the window function. By default, this matches `n_fft`.
|
||||
|
||||
n_fft : int > 0
|
||||
The length of each analysis frame.
|
||||
|
||||
dtype : np.dtype
|
||||
The data type of the output
|
||||
|
||||
Returns
|
||||
-------
|
||||
wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
|
||||
The sum-squared envelope of the window function
|
||||
"""
|
||||
if win_length is None:
|
||||
win_length = n_fft
|
||||
|
||||
n = n_fft + hop_length * (n_frames - 1)
|
||||
x = np.zeros(n, dtype=dtype)
|
||||
|
||||
# Compute the squared window at the desired length
|
||||
win_sq = get_window(window, win_length, fftbins=True)
|
||||
win_sq = librosa_util.normalize(win_sq, norm=norm)**2
|
||||
win_sq = librosa_util.pad_center(win_sq, n_fft)
|
||||
|
||||
# Fill the envelope
|
||||
for i in range(n_frames):
|
||||
sample = i * hop_length
|
||||
x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
|
||||
return x
|
||||
|
||||
|
||||
def griffin_lim(magnitudes, stft_fn, n_iters=30):
|
||||
"""
|
||||
PARAMS
|
||||
------
|
||||
magnitudes: spectrogram magnitudes
|
||||
stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
|
||||
"""
|
||||
|
||||
angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
|
||||
angles = angles.astype(np.float32)
|
||||
angles = torch.autograd.Variable(torch.from_numpy(angles))
|
||||
signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
|
||||
|
||||
for i in range(n_iters):
|
||||
_, angles = stft_fn.transform(signal)
|
||||
signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
|
||||
return signal
|
||||
|
||||
|
||||
def dynamic_range_compression(x, C=1, clip_val=1e-5):
|
||||
"""
|
||||
PARAMS
|
||||
------
|
||||
C: compression factor
|
||||
"""
|
||||
return torch.log(torch.clamp(x, min=clip_val) * C)
|
||||
|
||||
|
||||
def dynamic_range_decompression(x, C=1):
|
||||
"""
|
||||
PARAMS
|
||||
------
|
||||
C: compression factor used to compress
|
||||
"""
|
||||
return torch.exp(x) / C
|
||||
28
corpus.txt
28
corpus.txt
|
|
@ -1,28 +0,0 @@
|
|||
Thank you for calling Pampers.
|
||||
How may I help you today?
|
||||
I understand your frustration and disappointment.
|
||||
I'm sorry it's happening and I'd like to help prevent it in the future.
|
||||
What style of Baby Dry did you buy?
|
||||
Was it the Regular or the Flexible?
|
||||
I have all the information I need about the specifics of the product you purchased now.
|
||||
Thank you for your patience!
|
||||
How many diapers came in the package???
|
||||
And what size were they?
|
||||
Were they the small. The medium or the large ones???
|
||||
Sorry, without the size and count information I will be able to reimburse you with only a minimum fulfillment.
|
||||
Would that be okay??
|
||||
So you bought the Pampers Baby Dry and there were 32 diapers in the package.
|
||||
Is that correct?
|
||||
Thank you for all that information.'
|
||||
I will definitely pass on your detailed feedback to our Quality Control Team!
|
||||
I could also suggest a different variant of diapers, that might better suit your needs.
|
||||
Would you like me to help you with that?????
|
||||
How old is your little one?
|
||||
And do you have specific diaper needs that you can help me with??
|
||||
Our cruzers are made especially for active babies, and I would definitely suggest them since you said your little one moves around a lot.
|
||||
What I can do for you is, I can also include a coupon towards your next purchase of Pampers products.
|
||||
Shall I go ahead with this?
|
||||
I can send you a link over text message,,, from which you can directly order this.
|
||||
Is there anything else I can help you with today?
|
||||
Thank you for reaching out to us.
|
||||
Have a good day!!! Bye!
|
||||
111
data_utils.py
111
data_utils.py
|
|
@ -1,111 +0,0 @@
|
|||
import random
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.utils.data
|
||||
|
||||
import layers
|
||||
from utils import load_wav_to_torch, load_filepaths_and_text
|
||||
from text import text_to_sequence
|
||||
|
||||
|
||||
class TextMelLoader(torch.utils.data.Dataset):
|
||||
"""
|
||||
1) loads audio,text pairs
|
||||
2) normalizes text and converts them to sequences of one-hot vectors
|
||||
3) computes mel-spectrograms from audio files.
|
||||
"""
|
||||
def __init__(self, audiopaths_and_text, hparams):
|
||||
self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
|
||||
self.text_cleaners = hparams.text_cleaners
|
||||
self.max_wav_value = hparams.max_wav_value
|
||||
self.sampling_rate = hparams.sampling_rate
|
||||
self.load_mel_from_disk = hparams.load_mel_from_disk
|
||||
self.stft = layers.TacotronSTFT(
|
||||
hparams.filter_length, hparams.hop_length, hparams.win_length,
|
||||
hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
|
||||
hparams.mel_fmax)
|
||||
random.seed(1234)
|
||||
random.shuffle(self.audiopaths_and_text)
|
||||
|
||||
def get_mel_text_pair(self, audiopath_and_text):
|
||||
# separate filename and text
|
||||
audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
|
||||
text = self.get_text(text)
|
||||
mel = self.get_mel(audiopath)
|
||||
return (text, mel)
|
||||
|
||||
def get_mel(self, filename):
|
||||
if not self.load_mel_from_disk:
|
||||
audio, sampling_rate = load_wav_to_torch(filename)
|
||||
if sampling_rate != self.stft.sampling_rate:
|
||||
raise ValueError("{} {} SR doesn't match target {} SR".format(
|
||||
sampling_rate, self.stft.sampling_rate))
|
||||
audio_norm = audio / self.max_wav_value
|
||||
audio_norm = audio_norm.unsqueeze(0)
|
||||
audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
|
||||
melspec = self.stft.mel_spectrogram(audio_norm)
|
||||
melspec = torch.squeeze(melspec, 0)
|
||||
else:
|
||||
melspec = torch.from_numpy(np.load(filename))
|
||||
assert melspec.size(0) == self.stft.n_mel_channels, (
|
||||
'Mel dimension mismatch: given {}, expected {}'.format(
|
||||
melspec.size(0), self.stft.n_mel_channels))
|
||||
|
||||
return melspec
|
||||
|
||||
def get_text(self, text):
|
||||
text_norm = torch.IntTensor(text_to_sequence(text, self.text_cleaners))
|
||||
return text_norm
|
||||
|
||||
def __getitem__(self, index):
|
||||
return self.get_mel_text_pair(self.audiopaths_and_text[index])
|
||||
|
||||
def __len__(self):
|
||||
return len(self.audiopaths_and_text)
|
||||
|
||||
|
||||
class TextMelCollate():
|
||||
""" Zero-pads model inputs and targets based on number of frames per setep
|
||||
"""
|
||||
def __init__(self, n_frames_per_step):
|
||||
self.n_frames_per_step = n_frames_per_step
|
||||
|
||||
def __call__(self, batch):
|
||||
"""Collate's training batch from normalized text and mel-spectrogram
|
||||
PARAMS
|
||||
------
|
||||
batch: [text_normalized, mel_normalized]
|
||||
"""
|
||||
# Right zero-pad all one-hot text sequences to max input length
|
||||
input_lengths, ids_sorted_decreasing = torch.sort(
|
||||
torch.LongTensor([len(x[0]) for x in batch]),
|
||||
dim=0, descending=True)
|
||||
max_input_len = input_lengths[0]
|
||||
|
||||
text_padded = torch.LongTensor(len(batch), max_input_len)
|
||||
text_padded.zero_()
|
||||
for i in range(len(ids_sorted_decreasing)):
|
||||
text = batch[ids_sorted_decreasing[i]][0]
|
||||
text_padded[i, :text.size(0)] = text
|
||||
|
||||
# Right zero-pad mel-spec
|
||||
num_mels = batch[0][1].size(0)
|
||||
max_target_len = max([x[1].size(1) for x in batch])
|
||||
if max_target_len % self.n_frames_per_step != 0:
|
||||
max_target_len += self.n_frames_per_step - max_target_len % self.n_frames_per_step
|
||||
assert max_target_len % self.n_frames_per_step == 0
|
||||
|
||||
# include mel padded and gate padded
|
||||
mel_padded = torch.FloatTensor(len(batch), num_mels, max_target_len)
|
||||
mel_padded.zero_()
|
||||
gate_padded = torch.FloatTensor(len(batch), max_target_len)
|
||||
gate_padded.zero_()
|
||||
output_lengths = torch.LongTensor(len(batch))
|
||||
for i in range(len(ids_sorted_decreasing)):
|
||||
mel = batch[ids_sorted_decreasing[i]][1]
|
||||
mel_padded[i, :, :mel.size(1)] = mel
|
||||
gate_padded[i, mel.size(1)-1:] = 1
|
||||
output_lengths[i] = mel.size(1)
|
||||
|
||||
return text_padded, input_lengths, mel_padded, gate_padded, \
|
||||
output_lengths
|
||||
|
|
@ -1,27 +0,0 @@
|
|||
import grpc
|
||||
from sia.proto import tts_pb2
|
||||
from sia.proto import tts_pb2_grpc
|
||||
from tts import player_gen
|
||||
|
||||
|
||||
def tts_player():
|
||||
player = player_gen()
|
||||
channel = grpc.insecure_channel('localhost:50060')
|
||||
stub = tts_pb2_grpc.ServerStub(channel)
|
||||
|
||||
def play(t):
|
||||
test_text = tts_pb2.TextInput(text=t)
|
||||
speech = stub.TextToSpeechAPI(test_text)
|
||||
player(speech.response)
|
||||
return play
|
||||
|
||||
|
||||
def main():
|
||||
play = tts_player()
|
||||
play('How may I help you today?')
|
||||
import ipdb
|
||||
ipdb.set_trace()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
173
distributed.py
173
distributed.py
|
|
@ -1,173 +0,0 @@
|
|||
import torch
|
||||
import torch.distributed as dist
|
||||
from torch.nn.modules import Module
|
||||
from torch.autograd import Variable
|
||||
|
||||
def _flatten_dense_tensors(tensors):
|
||||
"""Flatten dense tensors into a contiguous 1D buffer. Assume tensors are of
|
||||
same dense type.
|
||||
Since inputs are dense, the resulting tensor will be a concatenated 1D
|
||||
buffer. Element-wise operation on this buffer will be equivalent to
|
||||
operating individually.
|
||||
Arguments:
|
||||
tensors (Iterable[Tensor]): dense tensors to flatten.
|
||||
Returns:
|
||||
A contiguous 1D buffer containing input tensors.
|
||||
"""
|
||||
if len(tensors) == 1:
|
||||
return tensors[0].contiguous().view(-1)
|
||||
flat = torch.cat([t.contiguous().view(-1) for t in tensors], dim=0)
|
||||
return flat
|
||||
|
||||
def _unflatten_dense_tensors(flat, tensors):
|
||||
"""View a flat buffer using the sizes of tensors. Assume that tensors are of
|
||||
same dense type, and that flat is given by _flatten_dense_tensors.
|
||||
Arguments:
|
||||
flat (Tensor): flattened dense tensors to unflatten.
|
||||
tensors (Iterable[Tensor]): dense tensors whose sizes will be used to
|
||||
unflatten flat.
|
||||
Returns:
|
||||
Unflattened dense tensors with sizes same as tensors and values from
|
||||
flat.
|
||||
"""
|
||||
outputs = []
|
||||
offset = 0
|
||||
for tensor in tensors:
|
||||
numel = tensor.numel()
|
||||
outputs.append(flat.narrow(0, offset, numel).view_as(tensor))
|
||||
offset += numel
|
||||
return tuple(outputs)
|
||||
|
||||
|
||||
'''
|
||||
This version of DistributedDataParallel is designed to be used in conjunction with the multiproc.py
|
||||
launcher included with this example. It assumes that your run is using multiprocess with 1
|
||||
GPU/process, that the model is on the correct device, and that torch.set_device has been
|
||||
used to set the device.
|
||||
|
||||
Parameters are broadcasted to the other processes on initialization of DistributedDataParallel,
|
||||
and will be allreduced at the finish of the backward pass.
|
||||
'''
|
||||
class DistributedDataParallel(Module):
|
||||
|
||||
def __init__(self, module):
|
||||
super(DistributedDataParallel, self).__init__()
|
||||
#fallback for PyTorch 0.3
|
||||
if not hasattr(dist, '_backend'):
|
||||
self.warn_on_half = True
|
||||
else:
|
||||
self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False
|
||||
|
||||
self.module = module
|
||||
|
||||
for p in self.module.state_dict().values():
|
||||
if not torch.is_tensor(p):
|
||||
continue
|
||||
dist.broadcast(p, 0)
|
||||
|
||||
def allreduce_params():
|
||||
if(self.needs_reduction):
|
||||
self.needs_reduction = False
|
||||
buckets = {}
|
||||
for param in self.module.parameters():
|
||||
if param.requires_grad and param.grad is not None:
|
||||
tp = type(param.data)
|
||||
if tp not in buckets:
|
||||
buckets[tp] = []
|
||||
buckets[tp].append(param)
|
||||
if self.warn_on_half:
|
||||
if torch.cuda.HalfTensor in buckets:
|
||||
print("WARNING: gloo dist backend for half parameters may be extremely slow." +
|
||||
" It is recommended to use the NCCL backend in this case. This currently requires" +
|
||||
"PyTorch built from top of tree master.")
|
||||
self.warn_on_half = False
|
||||
|
||||
for tp in buckets:
|
||||
bucket = buckets[tp]
|
||||
grads = [param.grad.data for param in bucket]
|
||||
coalesced = _flatten_dense_tensors(grads)
|
||||
dist.all_reduce(coalesced)
|
||||
coalesced /= dist.get_world_size()
|
||||
for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
|
||||
buf.copy_(synced)
|
||||
|
||||
for param in list(self.module.parameters()):
|
||||
def allreduce_hook(*unused):
|
||||
param._execution_engine.queue_callback(allreduce_params)
|
||||
if param.requires_grad:
|
||||
param.register_hook(allreduce_hook)
|
||||
|
||||
def forward(self, *inputs, **kwargs):
|
||||
self.needs_reduction = True
|
||||
return self.module(*inputs, **kwargs)
|
||||
|
||||
'''
|
||||
def _sync_buffers(self):
|
||||
buffers = list(self.module._all_buffers())
|
||||
if len(buffers) > 0:
|
||||
# cross-node buffer sync
|
||||
flat_buffers = _flatten_dense_tensors(buffers)
|
||||
dist.broadcast(flat_buffers, 0)
|
||||
for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)):
|
||||
buf.copy_(synced)
|
||||
def train(self, mode=True):
|
||||
# Clear NCCL communicator and CUDA event cache of the default group ID,
|
||||
# These cache will be recreated at the later call. This is currently a
|
||||
# work-around for a potential NCCL deadlock.
|
||||
if dist._backend == dist.dist_backend.NCCL:
|
||||
dist._clear_group_cache()
|
||||
super(DistributedDataParallel, self).train(mode)
|
||||
self.module.train(mode)
|
||||
'''
|
||||
'''
|
||||
Modifies existing model to do gradient allreduce, but doesn't change class
|
||||
so you don't need "module"
|
||||
'''
|
||||
def apply_gradient_allreduce(module):
|
||||
if not hasattr(dist, '_backend'):
|
||||
module.warn_on_half = True
|
||||
else:
|
||||
module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False
|
||||
|
||||
for p in module.state_dict().values():
|
||||
if not torch.is_tensor(p):
|
||||
continue
|
||||
dist.broadcast(p, 0)
|
||||
|
||||
def allreduce_params():
|
||||
if(module.needs_reduction):
|
||||
module.needs_reduction = False
|
||||
buckets = {}
|
||||
for param in module.parameters():
|
||||
if param.requires_grad and param.grad is not None:
|
||||
tp = param.data.dtype
|
||||
if tp not in buckets:
|
||||
buckets[tp] = []
|
||||
buckets[tp].append(param)
|
||||
if module.warn_on_half:
|
||||
if torch.cuda.HalfTensor in buckets:
|
||||
print("WARNING: gloo dist backend for half parameters may be extremely slow." +
|
||||
" It is recommended to use the NCCL backend in this case. This currently requires" +
|
||||
"PyTorch built from top of tree master.")
|
||||
module.warn_on_half = False
|
||||
|
||||
for tp in buckets:
|
||||
bucket = buckets[tp]
|
||||
grads = [param.grad.data for param in bucket]
|
||||
coalesced = _flatten_dense_tensors(grads)
|
||||
dist.all_reduce(coalesced)
|
||||
coalesced /= dist.get_world_size()
|
||||
for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
|
||||
buf.copy_(synced)
|
||||
|
||||
for param in list(module.parameters()):
|
||||
def allreduce_hook(*unused):
|
||||
Variable._execution_engine.queue_callback(allreduce_params)
|
||||
if param.requires_grad:
|
||||
param.register_hook(allreduce_hook)
|
||||
|
||||
def set_needs_reduction(self, input, output):
|
||||
self.needs_reduction = True
|
||||
|
||||
module.register_forward_hook(set_needs_reduction)
|
||||
return module
|
||||
311
glow.py
311
glow.py
|
|
@ -1,311 +0,0 @@
|
|||
# *****************************************************************************
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of the NVIDIA CORPORATION nor the
|
||||
# names of its contributors may be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
|
||||
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# *****************************************************************************
|
||||
import copy
|
||||
import torch
|
||||
from torch.autograd import Variable
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
@torch.jit.script
|
||||
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
|
||||
n_channels_int = n_channels[0]
|
||||
in_act = input_a+input_b
|
||||
t_act = torch.nn.functional.tanh(in_act[:, :n_channels_int, :])
|
||||
s_act = torch.nn.functional.sigmoid(in_act[:, n_channels_int:, :])
|
||||
acts = t_act * s_act
|
||||
return acts
|
||||
|
||||
|
||||
class WaveGlowLoss(torch.nn.Module):
|
||||
def __init__(self, sigma=1.0):
|
||||
super(WaveGlowLoss, self).__init__()
|
||||
self.sigma = sigma
|
||||
|
||||
def forward(self, model_output):
|
||||
z, log_s_list, log_det_W_list = model_output
|
||||
for i, log_s in enumerate(log_s_list):
|
||||
if i == 0:
|
||||
log_s_total = torch.sum(log_s)
|
||||
log_det_W_total = log_det_W_list[i]
|
||||
else:
|
||||
log_s_total = log_s_total + torch.sum(log_s)
|
||||
log_det_W_total += log_det_W_list[i]
|
||||
|
||||
loss = torch.sum(z*z)/(2*self.sigma*self.sigma) - log_s_total - log_det_W_total
|
||||
return loss/(z.size(0)*z.size(1)*z.size(2))
|
||||
|
||||
|
||||
class Invertible1x1Conv(torch.nn.Module):
|
||||
"""
|
||||
The layer outputs both the convolution, and the log determinant
|
||||
of its weight matrix. If reverse=True it does convolution with
|
||||
inverse
|
||||
"""
|
||||
def __init__(self, c):
|
||||
super(Invertible1x1Conv, self).__init__()
|
||||
self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0,
|
||||
bias=False)
|
||||
|
||||
# Sample a random orthonormal matrix to initialize weights
|
||||
W = torch.qr(torch.FloatTensor(c, c).normal_())[0]
|
||||
|
||||
# Ensure determinant is 1.0 not -1.0
|
||||
if torch.det(W) < 0:
|
||||
W[:,0] = -1*W[:,0]
|
||||
W = W.view(c, c, 1)
|
||||
self.conv.weight.data = W
|
||||
|
||||
def forward(self, z, reverse=False):
|
||||
# shape
|
||||
batch_size, group_size, n_of_groups = z.size()
|
||||
|
||||
W = self.conv.weight.squeeze()
|
||||
|
||||
if reverse:
|
||||
if not hasattr(self, 'W_inverse'):
|
||||
# Reverse computation
|
||||
W_inverse = W.inverse()
|
||||
W_inverse = Variable(W_inverse[..., None])
|
||||
if z.type() == 'torch.cuda.HalfTensor':
|
||||
W_inverse = W_inverse.half()
|
||||
self.W_inverse = W_inverse
|
||||
z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0)
|
||||
return z
|
||||
else:
|
||||
# Forward computation
|
||||
log_det_W = batch_size * n_of_groups * torch.logdet(W)
|
||||
z = self.conv(z)
|
||||
return z, log_det_W
|
||||
|
||||
|
||||
class WN(torch.nn.Module):
|
||||
"""
|
||||
This is the WaveNet like layer for the affine coupling. The primary difference
|
||||
from WaveNet is the convolutions need not be causal. There is also no dilation
|
||||
size reset. The dilation only doubles on each layer
|
||||
"""
|
||||
def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels,
|
||||
kernel_size):
|
||||
super(WN, self).__init__()
|
||||
assert(kernel_size % 2 == 1)
|
||||
assert(n_channels % 2 == 0)
|
||||
self.n_layers = n_layers
|
||||
self.n_channels = n_channels
|
||||
self.in_layers = torch.nn.ModuleList()
|
||||
self.res_skip_layers = torch.nn.ModuleList()
|
||||
self.cond_layers = torch.nn.ModuleList()
|
||||
|
||||
start = torch.nn.Conv1d(n_in_channels, n_channels, 1)
|
||||
start = torch.nn.utils.weight_norm(start, name='weight')
|
||||
self.start = start
|
||||
|
||||
# Initializing last layer to 0 makes the affine coupling layers
|
||||
# do nothing at first. This helps with training stability
|
||||
end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1)
|
||||
end.weight.data.zero_()
|
||||
end.bias.data.zero_()
|
||||
self.end = end
|
||||
|
||||
for i in range(n_layers):
|
||||
dilation = 2 ** i
|
||||
padding = int((kernel_size*dilation - dilation)/2)
|
||||
in_layer = torch.nn.Conv1d(n_channels, 2*n_channels, kernel_size,
|
||||
dilation=dilation, padding=padding)
|
||||
in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
|
||||
self.in_layers.append(in_layer)
|
||||
|
||||
cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels, 1)
|
||||
cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
|
||||
self.cond_layers.append(cond_layer)
|
||||
|
||||
# last one is not necessary
|
||||
if i < n_layers - 1:
|
||||
res_skip_channels = 2*n_channels
|
||||
else:
|
||||
res_skip_channels = n_channels
|
||||
res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1)
|
||||
res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
|
||||
self.res_skip_layers.append(res_skip_layer)
|
||||
|
||||
def forward(self, forward_input):
|
||||
audio, spect = forward_input
|
||||
audio = self.start(audio)
|
||||
for i in range(self.n_layers):
|
||||
acts = fused_add_tanh_sigmoid_multiply(
|
||||
self.in_layers[i](audio),
|
||||
self.cond_layers[i](spect),
|
||||
torch.IntTensor([self.n_channels]))
|
||||
|
||||
res_skip_acts = self.res_skip_layers[i](acts)
|
||||
if i < self.n_layers - 1:
|
||||
audio = res_skip_acts[:,:self.n_channels,:] + audio
|
||||
skip_acts = res_skip_acts[:,self.n_channels:,:]
|
||||
else:
|
||||
skip_acts = res_skip_acts
|
||||
|
||||
if i == 0:
|
||||
output = skip_acts
|
||||
else:
|
||||
output = skip_acts + output
|
||||
return self.end(output)
|
||||
|
||||
|
||||
class WaveGlow(torch.nn.Module):
|
||||
def __init__(self, n_mel_channels, n_flows, n_group, n_early_every,
|
||||
n_early_size, WN_config):
|
||||
super(WaveGlow, self).__init__()
|
||||
|
||||
self.upsample = torch.nn.ConvTranspose1d(n_mel_channels,
|
||||
n_mel_channels,
|
||||
1024, stride=256)
|
||||
assert(n_group % 2 == 0)
|
||||
self.n_flows = n_flows
|
||||
self.n_group = n_group
|
||||
self.n_early_every = n_early_every
|
||||
self.n_early_size = n_early_size
|
||||
self.WN = torch.nn.ModuleList()
|
||||
self.convinv = torch.nn.ModuleList()
|
||||
|
||||
n_half = int(n_group/2)
|
||||
|
||||
# Set up layers with the right sizes based on how many dimensions
|
||||
# have been output already
|
||||
n_remaining_channels = n_group
|
||||
for k in range(n_flows):
|
||||
if k % self.n_early_every == 0 and k > 0:
|
||||
n_half = n_half - int(self.n_early_size/2)
|
||||
n_remaining_channels = n_remaining_channels - self.n_early_size
|
||||
self.convinv.append(Invertible1x1Conv(n_remaining_channels))
|
||||
self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config))
|
||||
self.n_remaining_channels = n_remaining_channels # Useful during inference
|
||||
|
||||
def forward(self, forward_input):
|
||||
"""
|
||||
forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames
|
||||
forward_input[1] = audio: batch x time
|
||||
"""
|
||||
spect, audio = forward_input
|
||||
|
||||
# Upsample spectrogram to size of audio
|
||||
spect = self.upsample(spect)
|
||||
assert(spect.size(2) >= audio.size(1))
|
||||
if spect.size(2) > audio.size(1):
|
||||
spect = spect[:, :, :audio.size(1)]
|
||||
|
||||
spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
|
||||
spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)
|
||||
|
||||
audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1)
|
||||
output_audio = []
|
||||
log_s_list = []
|
||||
log_det_W_list = []
|
||||
|
||||
for k in range(self.n_flows):
|
||||
if k % self.n_early_every == 0 and k > 0:
|
||||
output_audio.append(audio[:,:self.n_early_size,:])
|
||||
audio = audio[:,self.n_early_size:,:]
|
||||
|
||||
audio, log_det_W = self.convinv[k](audio)
|
||||
log_det_W_list.append(log_det_W)
|
||||
|
||||
n_half = int(audio.size(1)/2)
|
||||
audio_0 = audio[:,:n_half,:]
|
||||
audio_1 = audio[:,n_half:,:]
|
||||
|
||||
output = self.WN[k]((audio_0, spect))
|
||||
log_s = output[:, n_half:, :]
|
||||
b = output[:, :n_half, :]
|
||||
audio_1 = torch.exp(log_s)*audio_1 + b
|
||||
log_s_list.append(log_s)
|
||||
|
||||
audio = torch.cat([audio_0, audio_1],1)
|
||||
|
||||
output_audio.append(audio)
|
||||
return torch.cat(output_audio,1), log_s_list, log_det_W_list
|
||||
|
||||
def infer(self, spect, sigma=1.0):
|
||||
spect = self.upsample(spect)
|
||||
# trim conv artifacts. maybe pad spec to kernel multiple
|
||||
time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0]
|
||||
spect = spect[:, :, :-time_cutoff]
|
||||
|
||||
spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
|
||||
spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)
|
||||
|
||||
if spect.type() == 'torch.cuda.HalfTensor':
|
||||
audio = torch.cuda.HalfTensor(spect.size(0),
|
||||
self.n_remaining_channels,
|
||||
spect.size(2)).normal_()
|
||||
else:
|
||||
# cuda.FloatTensor -> FloatTensor
|
||||
audio = torch.FloatTensor(spect.size(0),
|
||||
self.n_remaining_channels,
|
||||
spect.size(2)).normal_()
|
||||
|
||||
audio = torch.autograd.Variable(sigma*audio)
|
||||
|
||||
for k in reversed(range(self.n_flows)):
|
||||
n_half = int(audio.size(1)/2)
|
||||
audio_0 = audio[:,:n_half,:]
|
||||
audio_1 = audio[:,n_half:,:]
|
||||
|
||||
output = self.WN[k]((audio_0, spect))
|
||||
s = output[:, n_half:, :]
|
||||
b = output[:, :n_half, :]
|
||||
audio_1 = (audio_1 - b)/torch.exp(s)
|
||||
audio = torch.cat([audio_0, audio_1],1)
|
||||
|
||||
audio = self.convinv[k](audio, reverse=True)
|
||||
|
||||
if k % self.n_early_every == 0 and k > 0:
|
||||
if spect.type() == 'torch.cuda.HalfTensor':
|
||||
z = torch.cuda.HalfTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_()
|
||||
else:
|
||||
# cuda.FloatTensor -> FloatTensor
|
||||
z = torch.FloatTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_()
|
||||
audio = torch.cat((sigma*z, audio),1)
|
||||
|
||||
audio = audio.permute(0,2,1).contiguous().view(audio.size(0), -1).data
|
||||
return audio
|
||||
|
||||
@staticmethod
|
||||
def remove_weightnorm(model):
|
||||
waveglow = model
|
||||
for WN in waveglow.WN:
|
||||
WN.start = torch.nn.utils.remove_weight_norm(WN.start)
|
||||
WN.in_layers = remove(WN.in_layers)
|
||||
WN.cond_layers = remove(WN.cond_layers)
|
||||
WN.res_skip_layers = remove(WN.res_skip_layers)
|
||||
return waveglow
|
||||
|
||||
|
||||
def remove(conv_list):
|
||||
new_conv_list = torch.nn.ModuleList()
|
||||
for old_conv in conv_list:
|
||||
old_conv = torch.nn.utils.remove_weight_norm(old_conv)
|
||||
new_conv_list.append(old_conv)
|
||||
return new_conv_list
|
||||
96
hparams.py
96
hparams.py
|
|
@ -1,96 +0,0 @@
|
|||
import tensorflow as tf
|
||||
from text import symbols
|
||||
|
||||
|
||||
# changed path, sampling rate and batch size
|
||||
def create_hparams(hparams_string=None, verbose=False):
|
||||
"""Create model hyperparameters. Parse nondefault from given string."""
|
||||
|
||||
hparams = tf.contrib.training.HParams(
|
||||
################################
|
||||
# Experiment Parameters #
|
||||
################################
|
||||
epochs=500,
|
||||
iters_per_checkpoint=1000,
|
||||
seed=1234,
|
||||
dynamic_loss_scaling=True,
|
||||
fp16_run=False,
|
||||
distributed_run=False,
|
||||
dist_backend="nccl",
|
||||
dist_url="tcp://localhost:54321",
|
||||
cudnn_enabled=True,
|
||||
cudnn_benchmark=False,
|
||||
ignore_layers=['embedding.weight'],
|
||||
|
||||
################################
|
||||
# Data Parameters #
|
||||
################################
|
||||
load_mel_from_disk=False,
|
||||
training_files='lists/tts_data_train_processed.txt',
|
||||
validation_files='filelists/tts_data_val_processed.txt',
|
||||
text_cleaners=['english_cleaners'],
|
||||
|
||||
################################
|
||||
# Audio Parameters #
|
||||
################################
|
||||
max_wav_value=32768.0,
|
||||
sampling_rate=16000,
|
||||
filter_length=1024,
|
||||
hop_length=256,
|
||||
win_length=1024,
|
||||
n_mel_channels=80,
|
||||
mel_fmin=0.0,
|
||||
mel_fmax=8000.0,
|
||||
|
||||
################################
|
||||
# Model Parameters #
|
||||
################################
|
||||
n_symbols=len(symbols),
|
||||
symbols_embedding_dim=512,
|
||||
|
||||
# Encoder parameters
|
||||
encoder_kernel_size=5,
|
||||
encoder_n_convolutions=3,
|
||||
encoder_embedding_dim=512,
|
||||
|
||||
# Decoder parameters
|
||||
n_frames_per_step=1, # currently only 1 is supported
|
||||
decoder_rnn_dim=1024,
|
||||
prenet_dim=256,
|
||||
max_decoder_steps=1000,
|
||||
gate_threshold=0.5,
|
||||
p_attention_dropout=0.1,
|
||||
p_decoder_dropout=0.1,
|
||||
|
||||
# Attention parameters
|
||||
attention_rnn_dim=1024,
|
||||
attention_dim=128,
|
||||
|
||||
# Location Layer parameters
|
||||
attention_location_n_filters=32,
|
||||
attention_location_kernel_size=31,
|
||||
|
||||
# Mel-post processing network parameters
|
||||
postnet_embedding_dim=512,
|
||||
postnet_kernel_size=5,
|
||||
postnet_n_convolutions=5,
|
||||
|
||||
################################
|
||||
# Optimization Hyperparameters #
|
||||
################################
|
||||
use_saved_learning_rate=False,
|
||||
learning_rate=1e-3,
|
||||
weight_decay=1e-6,
|
||||
grad_clip_thresh=1.0,
|
||||
batch_size=4,
|
||||
mask_padding=True # set model's padded outputs to padded values
|
||||
)
|
||||
|
||||
if hparams_string:
|
||||
tf.logging.info('Parsing command line hparams: %s', hparams_string)
|
||||
hparams.parse(hparams_string)
|
||||
|
||||
if verbose:
|
||||
tf.logging.info('Final parsed hparams: %s', hparams.values())
|
||||
|
||||
return hparams
|
||||
80
layers.py
80
layers.py
|
|
@ -1,80 +0,0 @@
|
|||
import torch
|
||||
from librosa.filters import mel as librosa_mel_fn
|
||||
from audio_processing import dynamic_range_compression
|
||||
from audio_processing import dynamic_range_decompression
|
||||
from stft import STFT
|
||||
|
||||
|
||||
class LinearNorm(torch.nn.Module):
|
||||
def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
|
||||
super(LinearNorm, self).__init__()
|
||||
self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
|
||||
|
||||
torch.nn.init.xavier_uniform_(
|
||||
self.linear_layer.weight,
|
||||
gain=torch.nn.init.calculate_gain(w_init_gain))
|
||||
|
||||
def forward(self, x):
|
||||
return self.linear_layer(x)
|
||||
|
||||
|
||||
class ConvNorm(torch.nn.Module):
|
||||
def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
|
||||
padding=None, dilation=1, bias=True, w_init_gain='linear'):
|
||||
super(ConvNorm, self).__init__()
|
||||
if padding is None:
|
||||
assert(kernel_size % 2 == 1)
|
||||
padding = int(dilation * (kernel_size - 1) / 2)
|
||||
|
||||
self.conv = torch.nn.Conv1d(in_channels, out_channels,
|
||||
kernel_size=kernel_size, stride=stride,
|
||||
padding=padding, dilation=dilation,
|
||||
bias=bias)
|
||||
|
||||
torch.nn.init.xavier_uniform_(
|
||||
self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
|
||||
|
||||
def forward(self, signal):
|
||||
conv_signal = self.conv(signal)
|
||||
return conv_signal
|
||||
|
||||
|
||||
class TacotronSTFT(torch.nn.Module):
|
||||
def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
|
||||
n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
|
||||
mel_fmax=8000.0):
|
||||
super(TacotronSTFT, self).__init__()
|
||||
self.n_mel_channels = n_mel_channels
|
||||
self.sampling_rate = sampling_rate
|
||||
self.stft_fn = STFT(filter_length, hop_length, win_length)
|
||||
mel_basis = librosa_mel_fn(
|
||||
sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
|
||||
mel_basis = torch.from_numpy(mel_basis).float()
|
||||
self.register_buffer('mel_basis', mel_basis)
|
||||
|
||||
def spectral_normalize(self, magnitudes):
|
||||
output = dynamic_range_compression(magnitudes)
|
||||
return output
|
||||
|
||||
def spectral_de_normalize(self, magnitudes):
|
||||
output = dynamic_range_decompression(magnitudes)
|
||||
return output
|
||||
|
||||
def mel_spectrogram(self, y):
|
||||
"""Computes mel-spectrograms from a batch of waves
|
||||
PARAMS
|
||||
------
|
||||
y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
|
||||
|
||||
RETURNS
|
||||
-------
|
||||
mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
|
||||
"""
|
||||
assert(torch.min(y.data) >= -1)
|
||||
assert(torch.max(y.data) <= 1)
|
||||
|
||||
magnitudes, phases = self.stft_fn.transform(y)
|
||||
magnitudes = magnitudes.data
|
||||
mel_output = torch.matmul(self.mel_basis, magnitudes)
|
||||
mel_output = self.spectral_normalize(mel_output)
|
||||
return mel_output
|
||||
48
logger.py
48
logger.py
|
|
@ -1,48 +0,0 @@
|
|||
import random
|
||||
import torch
|
||||
from tensorboardX import SummaryWriter
|
||||
from plotting_utils import plot_alignment_to_numpy, plot_spectrogram_to_numpy
|
||||
from plotting_utils import plot_gate_outputs_to_numpy
|
||||
|
||||
|
||||
class Tacotron2Logger(SummaryWriter):
|
||||
def __init__(self, logdir):
|
||||
super(Tacotron2Logger, self).__init__(logdir)
|
||||
|
||||
def log_training(self, reduced_loss, grad_norm, learning_rate, duration,
|
||||
iteration):
|
||||
self.add_scalar("training.loss", reduced_loss, iteration)
|
||||
self.add_scalar("grad.norm", grad_norm, iteration)
|
||||
self.add_scalar("learning.rate", learning_rate, iteration)
|
||||
self.add_scalar("duration", duration, iteration)
|
||||
|
||||
def log_validation(self, reduced_loss, model, y, y_pred, iteration):
|
||||
self.add_scalar("validation.loss", reduced_loss, iteration)
|
||||
_, mel_outputs, gate_outputs, alignments = y_pred
|
||||
mel_targets, gate_targets = y
|
||||
|
||||
# plot distribution of parameters
|
||||
for tag, value in model.named_parameters():
|
||||
tag = tag.replace('.', '/')
|
||||
self.add_histogram(tag, value.data.cpu().numpy(), iteration)
|
||||
|
||||
# plot alignment, mel target and predicted, gate target and predicted
|
||||
idx = random.randint(0, alignments.size(0) - 1)
|
||||
self.add_image(
|
||||
"alignment",
|
||||
plot_alignment_to_numpy(alignments[idx].data.cpu().numpy().T),
|
||||
iteration)
|
||||
self.add_image(
|
||||
"mel_target",
|
||||
plot_spectrogram_to_numpy(mel_targets[idx].data.cpu().numpy()),
|
||||
iteration)
|
||||
self.add_image(
|
||||
"mel_predicted",
|
||||
plot_spectrogram_to_numpy(mel_outputs[idx].data.cpu().numpy()),
|
||||
iteration)
|
||||
self.add_image(
|
||||
"gate",
|
||||
plot_gate_outputs_to_numpy(
|
||||
gate_targets[idx].data.cpu().numpy(),
|
||||
torch.sigmoid(gate_outputs[idx]).data.cpu().numpy()),
|
||||
iteration)
|
||||
|
|
@ -1,19 +0,0 @@
|
|||
from torch import nn
|
||||
|
||||
|
||||
class Tacotron2Loss(nn.Module):
|
||||
def __init__(self):
|
||||
super(Tacotron2Loss, self).__init__()
|
||||
|
||||
def forward(self, model_output, targets):
|
||||
mel_target, gate_target = targets[0], targets[1]
|
||||
mel_target.requires_grad = False
|
||||
gate_target.requires_grad = False
|
||||
gate_target = gate_target.view(-1, 1)
|
||||
|
||||
mel_out, mel_out_postnet, gate_out, _ = model_output
|
||||
gate_out = gate_out.view(-1, 1)
|
||||
mel_loss = nn.MSELoss()(mel_out, mel_target) + \
|
||||
nn.MSELoss()(mel_out_postnet, mel_target)
|
||||
gate_loss = nn.BCEWithLogitsLoss()(gate_out, gate_target)
|
||||
return mel_loss + gate_loss
|
||||
529
model.py
529
model.py
|
|
@ -1,529 +0,0 @@
|
|||
from math import sqrt
|
||||
import torch
|
||||
from torch.autograd import Variable
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
from layers import ConvNorm, LinearNorm
|
||||
from utils import to_gpu, get_mask_from_lengths
|
||||
|
||||
|
||||
class LocationLayer(nn.Module):
|
||||
def __init__(self, attention_n_filters, attention_kernel_size,
|
||||
attention_dim):
|
||||
super(LocationLayer, self).__init__()
|
||||
padding = int((attention_kernel_size - 1) / 2)
|
||||
self.location_conv = ConvNorm(2, attention_n_filters,
|
||||
kernel_size=attention_kernel_size,
|
||||
padding=padding, bias=False, stride=1,
|
||||
dilation=1)
|
||||
self.location_dense = LinearNorm(attention_n_filters, attention_dim,
|
||||
bias=False, w_init_gain='tanh')
|
||||
|
||||
def forward(self, attention_weights_cat):
|
||||
processed_attention = self.location_conv(attention_weights_cat)
|
||||
processed_attention = processed_attention.transpose(1, 2)
|
||||
processed_attention = self.location_dense(processed_attention)
|
||||
return processed_attention
|
||||
|
||||
|
||||
class Attention(nn.Module):
|
||||
def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
|
||||
attention_location_n_filters, attention_location_kernel_size):
|
||||
super(Attention, self).__init__()
|
||||
self.query_layer = LinearNorm(attention_rnn_dim, attention_dim,
|
||||
bias=False, w_init_gain='tanh')
|
||||
self.memory_layer = LinearNorm(embedding_dim, attention_dim, bias=False,
|
||||
w_init_gain='tanh')
|
||||
self.v = LinearNorm(attention_dim, 1, bias=False)
|
||||
self.location_layer = LocationLayer(attention_location_n_filters,
|
||||
attention_location_kernel_size,
|
||||
attention_dim)
|
||||
self.score_mask_value = -float("inf")
|
||||
|
||||
def get_alignment_energies(self, query, processed_memory,
|
||||
attention_weights_cat):
|
||||
"""
|
||||
PARAMS
|
||||
------
|
||||
query: decoder output (batch, n_mel_channels * n_frames_per_step)
|
||||
processed_memory: processed encoder outputs (B, T_in, attention_dim)
|
||||
attention_weights_cat: cumulative and prev. att weights (B, 2, max_time)
|
||||
|
||||
RETURNS
|
||||
-------
|
||||
alignment (batch, max_time)
|
||||
"""
|
||||
|
||||
processed_query = self.query_layer(query.unsqueeze(1))
|
||||
processed_attention_weights = self.location_layer(attention_weights_cat)
|
||||
energies = self.v(torch.tanh(
|
||||
processed_query + processed_attention_weights + processed_memory))
|
||||
|
||||
energies = energies.squeeze(-1)
|
||||
return energies
|
||||
|
||||
def forward(self, attention_hidden_state, memory, processed_memory,
|
||||
attention_weights_cat, mask):
|
||||
"""
|
||||
PARAMS
|
||||
------
|
||||
attention_hidden_state: attention rnn last output
|
||||
memory: encoder outputs
|
||||
processed_memory: processed encoder outputs
|
||||
attention_weights_cat: previous and cummulative attention weights
|
||||
mask: binary mask for padded data
|
||||
"""
|
||||
alignment = self.get_alignment_energies(
|
||||
attention_hidden_state, processed_memory, attention_weights_cat)
|
||||
|
||||
if mask is not None:
|
||||
alignment.data.masked_fill_(mask, self.score_mask_value)
|
||||
|
||||
attention_weights = F.softmax(alignment, dim=1)
|
||||
attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
|
||||
attention_context = attention_context.squeeze(1)
|
||||
|
||||
return attention_context, attention_weights
|
||||
|
||||
|
||||
class Prenet(nn.Module):
|
||||
def __init__(self, in_dim, sizes):
|
||||
super(Prenet, self).__init__()
|
||||
in_sizes = [in_dim] + sizes[:-1]
|
||||
self.layers = nn.ModuleList(
|
||||
[LinearNorm(in_size, out_size, bias=False)
|
||||
for (in_size, out_size) in zip(in_sizes, sizes)])
|
||||
|
||||
def forward(self, x):
|
||||
for linear in self.layers:
|
||||
x = F.dropout(F.relu(linear(x)), p=0.5, training=True)
|
||||
return x
|
||||
|
||||
|
||||
class Postnet(nn.Module):
|
||||
"""Postnet
|
||||
- Five 1-d convolution with 512 channels and kernel size 5
|
||||
"""
|
||||
|
||||
def __init__(self, hparams):
|
||||
super(Postnet, self).__init__()
|
||||
self.convolutions = nn.ModuleList()
|
||||
|
||||
self.convolutions.append(
|
||||
nn.Sequential(
|
||||
ConvNorm(hparams.n_mel_channels, hparams.postnet_embedding_dim,
|
||||
kernel_size=hparams.postnet_kernel_size, stride=1,
|
||||
padding=int((hparams.postnet_kernel_size - 1) / 2),
|
||||
dilation=1, w_init_gain='tanh'),
|
||||
nn.BatchNorm1d(hparams.postnet_embedding_dim))
|
||||
)
|
||||
|
||||
for i in range(1, hparams.postnet_n_convolutions - 1):
|
||||
self.convolutions.append(
|
||||
nn.Sequential(
|
||||
ConvNorm(hparams.postnet_embedding_dim,
|
||||
hparams.postnet_embedding_dim,
|
||||
kernel_size=hparams.postnet_kernel_size, stride=1,
|
||||
padding=int((hparams.postnet_kernel_size - 1) / 2),
|
||||
dilation=1, w_init_gain='tanh'),
|
||||
nn.BatchNorm1d(hparams.postnet_embedding_dim))
|
||||
)
|
||||
|
||||
self.convolutions.append(
|
||||
nn.Sequential(
|
||||
ConvNorm(hparams.postnet_embedding_dim, hparams.n_mel_channels,
|
||||
kernel_size=hparams.postnet_kernel_size, stride=1,
|
||||
padding=int((hparams.postnet_kernel_size - 1) / 2),
|
||||
dilation=1, w_init_gain='linear'),
|
||||
nn.BatchNorm1d(hparams.n_mel_channels))
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
for i in range(len(self.convolutions) - 1):
|
||||
x = F.dropout(torch.tanh(self.convolutions[i](x)), 0.5, self.training)
|
||||
x = F.dropout(self.convolutions[-1](x), 0.5, self.training)
|
||||
|
||||
return x
|
||||
|
||||
|
||||
class Encoder(nn.Module):
|
||||
"""Encoder module:
|
||||
- Three 1-d convolution banks
|
||||
- Bidirectional LSTM
|
||||
"""
|
||||
def __init__(self, hparams):
|
||||
super(Encoder, self).__init__()
|
||||
|
||||
convolutions = []
|
||||
for _ in range(hparams.encoder_n_convolutions):
|
||||
conv_layer = nn.Sequential(
|
||||
ConvNorm(hparams.encoder_embedding_dim,
|
||||
hparams.encoder_embedding_dim,
|
||||
kernel_size=hparams.encoder_kernel_size, stride=1,
|
||||
padding=int((hparams.encoder_kernel_size - 1) / 2),
|
||||
dilation=1, w_init_gain='relu'),
|
||||
nn.BatchNorm1d(hparams.encoder_embedding_dim))
|
||||
convolutions.append(conv_layer)
|
||||
self.convolutions = nn.ModuleList(convolutions)
|
||||
|
||||
self.lstm = nn.LSTM(hparams.encoder_embedding_dim,
|
||||
int(hparams.encoder_embedding_dim / 2), 1,
|
||||
batch_first=True, bidirectional=True)
|
||||
|
||||
def forward(self, x, input_lengths):
|
||||
for conv in self.convolutions:
|
||||
x = F.dropout(F.relu(conv(x)), 0.5, self.training)
|
||||
|
||||
x = x.transpose(1, 2)
|
||||
|
||||
# pytorch tensor are not reversible, hence the conversion
|
||||
input_lengths = input_lengths.cpu().numpy()
|
||||
x = nn.utils.rnn.pack_padded_sequence(
|
||||
x, input_lengths, batch_first=True)
|
||||
|
||||
self.lstm.flatten_parameters()
|
||||
outputs, _ = self.lstm(x)
|
||||
|
||||
outputs, _ = nn.utils.rnn.pad_packed_sequence(
|
||||
outputs, batch_first=True)
|
||||
|
||||
return outputs
|
||||
|
||||
def inference(self, x):
|
||||
for conv in self.convolutions:
|
||||
x = F.dropout(F.relu(conv(x)), 0.5, self.training)
|
||||
|
||||
x = x.transpose(1, 2)
|
||||
|
||||
self.lstm.flatten_parameters()
|
||||
outputs, _ = self.lstm(x)
|
||||
|
||||
return outputs
|
||||
|
||||
|
||||
class Decoder(nn.Module):
|
||||
def __init__(self, hparams):
|
||||
super(Decoder, self).__init__()
|
||||
self.n_mel_channels = hparams.n_mel_channels
|
||||
self.n_frames_per_step = hparams.n_frames_per_step
|
||||
self.encoder_embedding_dim = hparams.encoder_embedding_dim
|
||||
self.attention_rnn_dim = hparams.attention_rnn_dim
|
||||
self.decoder_rnn_dim = hparams.decoder_rnn_dim
|
||||
self.prenet_dim = hparams.prenet_dim
|
||||
self.max_decoder_steps = hparams.max_decoder_steps
|
||||
self.gate_threshold = hparams.gate_threshold
|
||||
self.p_attention_dropout = hparams.p_attention_dropout
|
||||
self.p_decoder_dropout = hparams.p_decoder_dropout
|
||||
|
||||
self.prenet = Prenet(
|
||||
hparams.n_mel_channels * hparams.n_frames_per_step,
|
||||
[hparams.prenet_dim, hparams.prenet_dim])
|
||||
|
||||
self.attention_rnn = nn.LSTMCell(
|
||||
hparams.prenet_dim + hparams.encoder_embedding_dim,
|
||||
hparams.attention_rnn_dim)
|
||||
|
||||
self.attention_layer = Attention(
|
||||
hparams.attention_rnn_dim, hparams.encoder_embedding_dim,
|
||||
hparams.attention_dim, hparams.attention_location_n_filters,
|
||||
hparams.attention_location_kernel_size)
|
||||
|
||||
self.decoder_rnn = nn.LSTMCell(
|
||||
hparams.attention_rnn_dim + hparams.encoder_embedding_dim,
|
||||
hparams.decoder_rnn_dim, 1)
|
||||
|
||||
self.linear_projection = LinearNorm(
|
||||
hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
|
||||
hparams.n_mel_channels * hparams.n_frames_per_step)
|
||||
|
||||
self.gate_layer = LinearNorm(
|
||||
hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, 1,
|
||||
bias=True, w_init_gain='sigmoid')
|
||||
|
||||
def get_go_frame(self, memory):
|
||||
""" Gets all zeros frames to use as first decoder input
|
||||
PARAMS
|
||||
------
|
||||
memory: decoder outputs
|
||||
|
||||
RETURNS
|
||||
-------
|
||||
decoder_input: all zeros frames
|
||||
"""
|
||||
B = memory.size(0)
|
||||
decoder_input = Variable(memory.data.new(
|
||||
B, self.n_mel_channels * self.n_frames_per_step).zero_())
|
||||
return decoder_input
|
||||
|
||||
def initialize_decoder_states(self, memory, mask):
|
||||
""" Initializes attention rnn states, decoder rnn states, attention
|
||||
weights, attention cumulative weights, attention context, stores memory
|
||||
and stores processed memory
|
||||
PARAMS
|
||||
------
|
||||
memory: Encoder outputs
|
||||
mask: Mask for padded data if training, expects None for inference
|
||||
"""
|
||||
B = memory.size(0)
|
||||
MAX_TIME = memory.size(1)
|
||||
|
||||
self.attention_hidden = Variable(memory.data.new(
|
||||
B, self.attention_rnn_dim).zero_())
|
||||
self.attention_cell = Variable(memory.data.new(
|
||||
B, self.attention_rnn_dim).zero_())
|
||||
|
||||
self.decoder_hidden = Variable(memory.data.new(
|
||||
B, self.decoder_rnn_dim).zero_())
|
||||
self.decoder_cell = Variable(memory.data.new(
|
||||
B, self.decoder_rnn_dim).zero_())
|
||||
|
||||
self.attention_weights = Variable(memory.data.new(
|
||||
B, MAX_TIME).zero_())
|
||||
self.attention_weights_cum = Variable(memory.data.new(
|
||||
B, MAX_TIME).zero_())
|
||||
self.attention_context = Variable(memory.data.new(
|
||||
B, self.encoder_embedding_dim).zero_())
|
||||
|
||||
self.memory = memory
|
||||
self.processed_memory = self.attention_layer.memory_layer(memory)
|
||||
self.mask = mask
|
||||
|
||||
def parse_decoder_inputs(self, decoder_inputs):
|
||||
""" Prepares decoder inputs, i.e. mel outputs
|
||||
PARAMS
|
||||
------
|
||||
decoder_inputs: inputs used for teacher-forced training, i.e. mel-specs
|
||||
|
||||
RETURNS
|
||||
-------
|
||||
inputs: processed decoder inputs
|
||||
|
||||
"""
|
||||
# (B, n_mel_channels, T_out) -> (B, T_out, n_mel_channels)
|
||||
decoder_inputs = decoder_inputs.transpose(1, 2)
|
||||
decoder_inputs = decoder_inputs.view(
|
||||
decoder_inputs.size(0),
|
||||
int(decoder_inputs.size(1)/self.n_frames_per_step), -1)
|
||||
# (B, T_out, n_mel_channels) -> (T_out, B, n_mel_channels)
|
||||
decoder_inputs = decoder_inputs.transpose(0, 1)
|
||||
return decoder_inputs
|
||||
|
||||
def parse_decoder_outputs(self, mel_outputs, gate_outputs, alignments):
|
||||
""" Prepares decoder outputs for output
|
||||
PARAMS
|
||||
------
|
||||
mel_outputs:
|
||||
gate_outputs: gate output energies
|
||||
alignments:
|
||||
|
||||
RETURNS
|
||||
-------
|
||||
mel_outputs:
|
||||
gate_outpust: gate output energies
|
||||
alignments:
|
||||
"""
|
||||
# (T_out, B) -> (B, T_out)
|
||||
alignments = torch.stack(alignments).transpose(0, 1)
|
||||
# (T_out, B) -> (B, T_out)
|
||||
gate_outputs = torch.stack(gate_outputs).transpose(0, 1)
|
||||
gate_outputs = gate_outputs.contiguous()
|
||||
# (T_out, B, n_mel_channels) -> (B, T_out, n_mel_channels)
|
||||
mel_outputs = torch.stack(mel_outputs).transpose(0, 1).contiguous()
|
||||
# decouple frames per step
|
||||
mel_outputs = mel_outputs.view(
|
||||
mel_outputs.size(0), -1, self.n_mel_channels)
|
||||
# (B, T_out, n_mel_channels) -> (B, n_mel_channels, T_out)
|
||||
mel_outputs = mel_outputs.transpose(1, 2)
|
||||
|
||||
return mel_outputs, gate_outputs, alignments
|
||||
|
||||
def decode(self, decoder_input):
|
||||
""" Decoder step using stored states, attention and memory
|
||||
PARAMS
|
||||
------
|
||||
decoder_input: previous mel output
|
||||
|
||||
RETURNS
|
||||
-------
|
||||
mel_output:
|
||||
gate_output: gate output energies
|
||||
attention_weights:
|
||||
"""
|
||||
cell_input = torch.cat((decoder_input, self.attention_context), -1)
|
||||
self.attention_hidden, self.attention_cell = self.attention_rnn(
|
||||
cell_input, (self.attention_hidden, self.attention_cell))
|
||||
self.attention_hidden = F.dropout(
|
||||
self.attention_hidden, self.p_attention_dropout, self.training)
|
||||
|
||||
attention_weights_cat = torch.cat(
|
||||
(self.attention_weights.unsqueeze(1),
|
||||
self.attention_weights_cum.unsqueeze(1)), dim=1)
|
||||
self.attention_context, self.attention_weights = self.attention_layer(
|
||||
self.attention_hidden, self.memory, self.processed_memory,
|
||||
attention_weights_cat, self.mask)
|
||||
|
||||
self.attention_weights_cum += self.attention_weights
|
||||
decoder_input = torch.cat(
|
||||
(self.attention_hidden, self.attention_context), -1)
|
||||
self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
|
||||
decoder_input, (self.decoder_hidden, self.decoder_cell))
|
||||
self.decoder_hidden = F.dropout(
|
||||
self.decoder_hidden, self.p_decoder_dropout, self.training)
|
||||
|
||||
decoder_hidden_attention_context = torch.cat(
|
||||
(self.decoder_hidden, self.attention_context), dim=1)
|
||||
decoder_output = self.linear_projection(
|
||||
decoder_hidden_attention_context)
|
||||
|
||||
gate_prediction = self.gate_layer(decoder_hidden_attention_context)
|
||||
return decoder_output, gate_prediction, self.attention_weights
|
||||
|
||||
def forward(self, memory, decoder_inputs, memory_lengths):
|
||||
""" Decoder forward pass for training
|
||||
PARAMS
|
||||
------
|
||||
memory: Encoder outputs
|
||||
decoder_inputs: Decoder inputs for teacher forcing. i.e. mel-specs
|
||||
memory_lengths: Encoder output lengths for attention masking.
|
||||
|
||||
RETURNS
|
||||
-------
|
||||
mel_outputs: mel outputs from the decoder
|
||||
gate_outputs: gate outputs from the decoder
|
||||
alignments: sequence of attention weights from the decoder
|
||||
"""
|
||||
|
||||
decoder_input = self.get_go_frame(memory).unsqueeze(0)
|
||||
decoder_inputs = self.parse_decoder_inputs(decoder_inputs)
|
||||
decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0)
|
||||
decoder_inputs = self.prenet(decoder_inputs)
|
||||
|
||||
self.initialize_decoder_states(
|
||||
memory, mask=~get_mask_from_lengths(memory_lengths))
|
||||
|
||||
mel_outputs, gate_outputs, alignments = [], [], []
|
||||
while len(mel_outputs) < decoder_inputs.size(0) - 1:
|
||||
decoder_input = decoder_inputs[len(mel_outputs)]
|
||||
mel_output, gate_output, attention_weights = self.decode(
|
||||
decoder_input)
|
||||
mel_outputs += [mel_output.squeeze(1)]
|
||||
gate_outputs += [gate_output.squeeze()]
|
||||
alignments += [attention_weights]
|
||||
|
||||
mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
|
||||
mel_outputs, gate_outputs, alignments)
|
||||
|
||||
return mel_outputs, gate_outputs, alignments
|
||||
|
||||
    def inference(self, memory):
        """ Decoder inference

        PARAMS
        ------
        memory: Encoder outputs

        RETURNS
        -------
        mel_outputs: mel outputs from the decoder
        gate_outputs: gate outputs from the decoder
        alignments: sequence of attention weights from the decoder
        """
        decoder_input = self.get_go_frame(memory)

        self.initialize_decoder_states(memory, mask=None)

        mel_outputs, gate_outputs, alignments = [], [], []
        while True:
            decoder_input = self.prenet(decoder_input)
            mel_output, gate_output, alignment = self.decode(decoder_input)

            mel_outputs += [mel_output.squeeze(1)]
            gate_outputs += [gate_output]
            alignments += [alignment]

            if torch.sigmoid(gate_output.data) > self.gate_threshold:
                break
            elif len(mel_outputs) == self.max_decoder_steps:
                print("Warning! Reached max decoder steps")
                break

            decoder_input = mel_output

        mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
            mel_outputs, gate_outputs, alignments)

        return mel_outputs, gate_outputs, alignments

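# Illustrative, self-contained sketch of the stopping rule used by
# Decoder.inference() above: decoding continues until the sigmoid of the gate
# logit crosses gate_threshold or max_decoder_steps frames have been produced.
# toy_decode and the two constants are stand-ins for illustration only; the
# real values live in hparams.py and Decoder.decode().
import torch

gate_threshold, max_decoder_steps = 0.5, 1000  # assumed defaults


def toy_decode(step):
    # stand-in for Decoder.decode(): a fake mel frame plus a gate logit that
    # grows as decoding proceeds
    return torch.randn(1, 80), torch.tensor([step / 50.0 - 3.0])


frames, step = [], 0
while True:
    mel, gate_logit = toy_decode(step)
    frames.append(mel)
    if torch.sigmoid(gate_logit) > gate_threshold:
        break
    elif len(frames) == max_decoder_steps:
        print("Warning! Reached max decoder steps")
        break
    step += 1
print(len(frames), "frames generated")  # roughly 150 with this toy gate schedule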
class Tacotron2(nn.Module):
    def __init__(self, hparams):
        super(Tacotron2, self).__init__()
        self.mask_padding = hparams.mask_padding
        self.fp16_run = hparams.fp16_run
        self.n_mel_channels = hparams.n_mel_channels
        self.n_frames_per_step = hparams.n_frames_per_step
        self.embedding = nn.Embedding(
            hparams.n_symbols, hparams.symbols_embedding_dim)
        std = sqrt(2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim))
        val = sqrt(3.0) * std  # uniform bounds for std
        self.embedding.weight.data.uniform_(-val, val)
        self.encoder = Encoder(hparams)
        self.decoder = Decoder(hparams)
        self.postnet = Postnet(hparams)

    def parse_batch(self, batch):
        text_padded, input_lengths, mel_padded, gate_padded, \
            output_lengths = batch
        text_padded = to_gpu(text_padded).long()
        input_lengths = to_gpu(input_lengths).long()
        max_len = torch.max(input_lengths.data).item()
        mel_padded = to_gpu(mel_padded).float()
        gate_padded = to_gpu(gate_padded).float()
        output_lengths = to_gpu(output_lengths).long()

        return (
            (text_padded, input_lengths, mel_padded, max_len, output_lengths),
            (mel_padded, gate_padded))

    def parse_output(self, outputs, output_lengths=None):
        if self.mask_padding and output_lengths is not None:
            mask = ~get_mask_from_lengths(output_lengths)
            mask = mask.expand(self.n_mel_channels, mask.size(0), mask.size(1))
            mask = mask.permute(1, 0, 2)

            outputs[0].data.masked_fill_(mask, 0.0)
            outputs[1].data.masked_fill_(mask, 0.0)
            outputs[2].data.masked_fill_(mask[:, 0, :], 1e3)  # gate energies

        return outputs

    def forward(self, inputs):
        text_inputs, text_lengths, mels, max_len, output_lengths = inputs
        text_lengths, output_lengths = text_lengths.data, output_lengths.data

        embedded_inputs = self.embedding(text_inputs).transpose(1, 2)

        encoder_outputs = self.encoder(embedded_inputs, text_lengths)

        mel_outputs, gate_outputs, alignments = self.decoder(
            encoder_outputs, mels, memory_lengths=text_lengths)

        mel_outputs_postnet = self.postnet(mel_outputs)
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet

        return self.parse_output(
            [mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
            output_lengths)

    def inference(self, inputs):
        embedded_inputs = self.embedding(inputs).transpose(1, 2)
        encoder_outputs = self.encoder.inference(embedded_inputs)
        mel_outputs, gate_outputs, alignments = self.decoder.inference(
            encoder_outputs)

        mel_outputs_postnet = self.postnet(mel_outputs)
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet

        outputs = self.parse_output(
            [mel_outputs, mel_outputs_postnet, gate_outputs, alignments])

        return outputs
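# Illustrative usage sketch for the model above, assuming CPU execution and a
# checkpoint file named like the ones this repo produces (checkpoint_15000 is
# an assumption); load_model, create_hparams and text_to_sequence come from
# train.py, hparams.py and text/ in this repository. It shows how
# Tacotron2.inference yields a mel spectrogram for a vocoder such as WaveGlow.
import numpy as np
import torch
from hparams import create_hparams
from train import load_model
from text import text_to_sequence

hparams = create_hparams()
model = load_model(hparams)
state = torch.load("checkpoint_15000", map_location="cpu")["state_dict"]
model.load_state_dict(state)
model.eval()

sequence = np.array(
    text_to_sequence("Hello world.", ["english_cleaners"]))[None, :]
sequence = torch.from_numpy(sequence).long()
mel_outputs, mel_outputs_postnet, gate_outputs, alignments = model.inference(sequence)
print(mel_outputs_postnet.shape)  # (1, n_mel_channels, decoder steps)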
@ -1,61 +0,0 @@
import matplotlib
matplotlib.use("Agg")
import matplotlib.pylab as plt
import numpy as np


def save_figure_to_numpy(fig):
    # save it to a numpy array.
    data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    return data


def plot_alignment_to_numpy(alignment, info=None):
    fig, ax = plt.subplots(figsize=(6, 4))
    im = ax.imshow(alignment, aspect='auto', origin='lower',
                   interpolation='none')
    fig.colorbar(im, ax=ax)
    xlabel = 'Decoder timestep'
    if info is not None:
        xlabel += '\n\n' + info
    plt.xlabel(xlabel)
    plt.ylabel('Encoder timestep')
    plt.tight_layout()

    fig.canvas.draw()
    data = save_figure_to_numpy(fig)
    plt.close()
    return data


def plot_spectrogram_to_numpy(spectrogram):
    fig, ax = plt.subplots(figsize=(12, 3))
    im = ax.imshow(spectrogram, aspect="auto", origin="lower",
                   interpolation='none')
    plt.colorbar(im, ax=ax)
    plt.xlabel("Frames")
    plt.ylabel("Channels")
    plt.tight_layout()

    fig.canvas.draw()
    data = save_figure_to_numpy(fig)
    plt.close()
    return data


def plot_gate_outputs_to_numpy(gate_targets, gate_outputs):
    fig, ax = plt.subplots(figsize=(12, 3))
    ax.scatter(range(len(gate_targets)), gate_targets, alpha=0.5,
               color='green', marker='+', s=1, label='target')
    ax.scatter(range(len(gate_outputs)), gate_outputs, alpha=0.5,
               color='red', marker='.', s=1, label='predicted')

    plt.xlabel("Frames (Green target, Red predicted)")
    plt.ylabel("Gate State")
    plt.tight_layout()

    fig.canvas.draw()
    data = save_figure_to_numpy(fig)
    plt.close()
    return data
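# Illustrative usage of plot_alignment_to_numpy above (random data; Pillow is
# already pinned in requirements.txt). The returned HxWx3 uint8 array can be
# written to disk or logged to TensorBoard.
import numpy as np
from PIL import Image

alignment = np.random.rand(40, 120)   # (encoder steps, decoder steps), made up
img = plot_alignment_to_numpy(alignment)
Image.fromarray(img).save("alignment.png")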
@ -1,10 +0,0 @@
tensorflow
numpy
inflect==0.2.5
librosa==0.6.0
scipy
tensorboardX
Unidecode==1.0.22
pillow
torch==1.1.0
pysoundfile
@ -1,15 +0,0 @@
pip==18.1
bumpversion==0.5.3
wheel==0.32.1
watchdog==0.9.0
flake8==3.5.0
tox==3.5.2
coverage==4.5.1
Sphinx==1.8.1
twine==1.12.1

pytest==3.8.2
pytest-runner==4.2
pre-commit==1.16.1
python-language-server[all]
ipdb
39
server.py

@ -1,39 +0,0 @@
# -*- coding: utf-8 -*-
import grpc
import time
from sia.proto import tts_pb2
from sia.proto import tts_pb2_grpc
from concurrent import futures
from sia.instruments import do_time
from tts import TTSModel


class TTSServer():
    def __init__(self):
        self.tts_model = TTSModel()

    def TextToSpeechAPI(self, request, context):
        # Synthesize the requested text and return the raw PCM bytes.
        input_text = request.text
        speech_response = self.tts_model.synth_speech(input_text)
        return tts_pb2.SpeechResponse(response=speech_response)


def main():
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
    tts_server = TTSServer()
    tts_pb2_grpc.add_ServerServicer_to_server(tts_server, server)
    server.add_insecure_port('localhost:50060')
    server.start()
    print('TTSServer started!')

    try:
        # gRPC serves requests on background threads; keep the main thread alive.
        while True:
            time.sleep(10000)
    except KeyboardInterrupt:
        server.stop(0)


if __name__ == "__main__":
    main()
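# Illustrative client for the service above. The stub name follows from
# add_ServerServicer_to_server and the fields used in TextToSpeechAPI; the
# request message name (SpeechRequest) is an assumption -- check
# sia/proto/tts.proto for the real one. The reply carries raw 16 kHz, mono,
# 16-bit PCM (see float2pcm in tts.py).
import grpc
from sia.proto import tts_pb2
from sia.proto import tts_pb2_grpc

with grpc.insecure_channel('localhost:50060') as channel:
    stub = tts_pb2_grpc.ServerStub(channel)
    reply = stub.TextToSpeechAPI(tts_pb2.SpeechRequest(text='Hello there.'))
    with open('reply.pcm', 'wb') as f:
        f.write(reply.response)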
141
stft.py
141
stft.py
|
|
@ -1,141 +0,0 @@
|
|||
"""
|
||||
BSD 3-Clause License
|
||||
|
||||
Copyright (c) 2017, Prem Seetharaman
|
||||
All rights reserved.
|
||||
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright notice, this
|
||||
list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the copyright holder nor the names of its
|
||||
contributors may be used to endorse or promote products derived from this
|
||||
software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
|
||||
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
"""
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
import torch.nn.functional as F
|
||||
from torch.autograd import Variable
|
||||
from scipy.signal import get_window
|
||||
from librosa.util import pad_center, tiny
|
||||
from audio_processing import window_sumsquare
|
||||
|
||||
|
||||
class STFT(torch.nn.Module):
|
||||
"""adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
|
||||
def __init__(self, filter_length=800, hop_length=200, win_length=800,
|
||||
window='hann'):
|
||||
super(STFT, self).__init__()
|
||||
self.filter_length = filter_length
|
||||
self.hop_length = hop_length
|
||||
self.win_length = win_length
|
||||
self.window = window
|
||||
self.forward_transform = None
|
||||
scale = self.filter_length / self.hop_length
|
||||
fourier_basis = np.fft.fft(np.eye(self.filter_length))
|
||||
|
||||
cutoff = int((self.filter_length / 2 + 1))
|
||||
fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),
|
||||
np.imag(fourier_basis[:cutoff, :])])
|
||||
|
||||
forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
|
||||
inverse_basis = torch.FloatTensor(
|
||||
np.linalg.pinv(scale * fourier_basis).T[:, None, :])
|
||||
|
||||
if window is not None:
|
||||
assert(filter_length >= win_length)
|
||||
# get window and zero center pad it to filter_length
|
||||
fft_window = get_window(window, win_length, fftbins=True)
|
||||
fft_window = pad_center(fft_window, filter_length)
|
||||
fft_window = torch.from_numpy(fft_window).float()
|
||||
|
||||
# window the bases
|
||||
forward_basis *= fft_window
|
||||
inverse_basis *= fft_window
|
||||
|
||||
self.register_buffer('forward_basis', forward_basis.float())
|
||||
self.register_buffer('inverse_basis', inverse_basis.float())
|
||||
|
||||
def transform(self, input_data):
|
||||
num_batches = input_data.size(0)
|
||||
num_samples = input_data.size(1)
|
||||
|
||||
self.num_samples = num_samples
|
||||
|
||||
# similar to librosa, reflect-pad the input
|
||||
input_data = input_data.view(num_batches, 1, num_samples)
|
||||
input_data = F.pad(
|
||||
input_data.unsqueeze(1),
|
||||
(int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
|
||||
mode='reflect')
|
||||
input_data = input_data.squeeze(1)
|
||||
|
||||
forward_transform = F.conv1d(
|
||||
input_data,
|
||||
Variable(self.forward_basis, requires_grad=False),
|
||||
stride=self.hop_length,
|
||||
padding=0)
|
||||
|
||||
cutoff = int((self.filter_length / 2) + 1)
|
||||
real_part = forward_transform[:, :cutoff, :]
|
||||
imag_part = forward_transform[:, cutoff:, :]
|
||||
|
||||
magnitude = torch.sqrt(real_part**2 + imag_part**2)
|
||||
phase = torch.autograd.Variable(
|
||||
torch.atan2(imag_part.data, real_part.data))
|
||||
|
||||
return magnitude, phase
|
||||
|
||||
def inverse(self, magnitude, phase):
|
||||
recombine_magnitude_phase = torch.cat(
|
||||
[magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1)
|
||||
|
||||
inverse_transform = F.conv_transpose1d(
|
||||
recombine_magnitude_phase,
|
||||
Variable(self.inverse_basis, requires_grad=False),
|
||||
stride=self.hop_length,
|
||||
padding=0)
|
||||
|
||||
if self.window is not None:
|
||||
window_sum = window_sumsquare(
|
||||
self.window, magnitude.size(-1), hop_length=self.hop_length,
|
||||
win_length=self.win_length, n_fft=self.filter_length,
|
||||
dtype=np.float32)
|
||||
# remove modulation effects
|
||||
approx_nonzero_indices = torch.from_numpy(
|
||||
np.where(window_sum > tiny(window_sum))[0])
|
||||
window_sum = torch.autograd.Variable(
|
||||
torch.from_numpy(window_sum), requires_grad=False)
|
||||
#window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum #initially not commented out
|
||||
inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]
|
||||
|
||||
# scale by hop ratio
|
||||
inverse_transform *= float(self.filter_length) / self.hop_length
|
||||
|
||||
inverse_transform = inverse_transform[:, :, int(self.filter_length/2):]
|
||||
inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2):]
|
||||
|
||||
return inverse_transform
|
||||
|
||||
def forward(self, input_data):
|
||||
self.magnitude, self.phase = self.transform(input_data)
|
||||
reconstruction = self.inverse(self.magnitude, self.phase)
|
||||
return reconstruction
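# Illustrative round-trip check for the STFT module above (random audio,
# default constructor arguments, librosa 0.6.0 as pinned in requirements.txt):
# transform followed by inverse returns a signal of the original length.
import torch

stft = STFT(filter_length=800, hop_length=200, win_length=800, window='hann')
audio = torch.randn(1, 16000)
magnitude, phase = stft.transform(audio)
reconstruction = stft.inverse(magnitude, phase)
print(magnitude.shape, reconstruction.shape)  # (1, 401, 81), (1, 1, 16000)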
|
||||
|
|
@ -1,22 +1,23 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
""" from https://github.com/keithito/tacotron """
|
||||
import re
|
||||
from text import cleaners
|
||||
from text.symbols import symbols
|
||||
|
||||
from . import cleaners
|
||||
from .symbols import symbols
|
||||
|
||||
# Mappings from symbol to numeric ID and vice versa:
|
||||
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
|
||||
_id_to_symbol = {i: s for i, s in enumerate(symbols)}
|
||||
|
||||
# Regular expression matching text enclosed in curly braces:
|
||||
_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
|
||||
_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")
|
||||
|
||||
|
||||
def text_to_sequence(text, cleaner_names):
|
||||
'''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
|
||||
"""Converts a string of text to a sequence of IDs corresponding to the
|
||||
symbols in the text.
|
||||
|
||||
The text can optionally have ARPAbet sequences enclosed in curly braces embedded
|
||||
in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
|
||||
The text can optionally have ARPAbet sequences enclosed in curly braces
|
||||
embedded in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
|
||||
|
||||
Args:
|
||||
text: string to convert to a sequence
|
||||
|
|
@ -24,51 +25,53 @@ def text_to_sequence(text, cleaner_names):
|
|||
|
||||
Returns:
|
||||
List of integers corresponding to the symbols in the text
|
||||
'''
|
||||
sequence = []
|
||||
"""
|
||||
sequence = []
|
||||
|
||||
# Check for curly braces and treat their contents as ARPAbet:
|
||||
while len(text):
|
||||
m = _curly_re.match(text)
|
||||
if not m:
|
||||
sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
|
||||
break
|
||||
sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
|
||||
sequence += _arpabet_to_sequence(m.group(2))
|
||||
text = m.group(3)
|
||||
# Check for curly braces and treat their contents as ARPAbet:
|
||||
while len(text):
|
||||
m = _curly_re.match(text)
|
||||
if not m:
|
||||
sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
|
||||
break
|
||||
sequence += _symbols_to_sequence(
|
||||
_clean_text(m.group(1), cleaner_names)
|
||||
)
|
||||
sequence += _arpabet_to_sequence(m.group(2))
|
||||
text = m.group(3)
|
||||
|
||||
return sequence
|
||||
return sequence
|
||||
|
||||
|
||||
def sequence_to_text(sequence):
|
||||
'''Converts a sequence of IDs back to a string'''
|
||||
result = ''
|
||||
for symbol_id in sequence:
|
||||
if symbol_id in _id_to_symbol:
|
||||
s = _id_to_symbol[symbol_id]
|
||||
# Enclose ARPAbet back in curly braces:
|
||||
if len(s) > 1 and s[0] == '@':
|
||||
s = '{%s}' % s[1:]
|
||||
result += s
|
||||
return result.replace('}{', ' ')
|
||||
"""Converts a sequence of IDs back to a string"""
|
||||
result = ""
|
||||
for symbol_id in sequence:
|
||||
if symbol_id in _id_to_symbol:
|
||||
s = _id_to_symbol[symbol_id]
|
||||
# Enclose ARPAbet back in curly braces:
|
||||
if len(s) > 1 and s[0] == "@":
|
||||
s = "{%s}" % s[1:]
|
||||
result += s
|
||||
return result.replace("}{", " ")
|
||||
|
||||
|
||||
def _clean_text(text, cleaner_names):
|
||||
for name in cleaner_names:
|
||||
cleaner = getattr(cleaners, name)
|
||||
if not cleaner:
|
||||
raise Exception('Unknown cleaner: %s' % name)
|
||||
text = cleaner(text)
|
||||
return text
|
||||
for name in cleaner_names:
|
||||
cleaner = getattr(cleaners, name)
|
||||
if not cleaner:
|
||||
raise Exception("Unknown cleaner: %s" % name)
|
||||
text = cleaner(text)
|
||||
return text
|
||||
|
||||
|
||||
def _symbols_to_sequence(symbols):
|
||||
return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
|
||||
return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
|
||||
|
||||
|
||||
def _arpabet_to_sequence(text):
|
||||
return _symbols_to_sequence(['@' + s for s in text.split()])
|
||||
return _symbols_to_sequence(["@" + s for s in text.split()])
|
||||
|
||||
|
||||
def _should_keep_symbol(s):
|
||||
return s in _symbol_to_id and s is not '_' and s is not '~'
|
||||
return s in _symbol_to_id and s != "_" and s != "~"
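# Illustrative round trip with the helpers above: ARPAbet segments in curly
# braces survive text_to_sequence / sequence_to_text unchanged.
seq = text_to_sequence("Turn left on {HH AW1 S} street.", ["english_cleaners"])
print(sequence_to_text(seq))  # -> "turn left on {HH AW1 S} street."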
|
||||
|
|
|
|||
125
text/cleaners.py
125
text/cleaners.py
|
|
@ -1,90 +1,99 @@
|
|||
""" from https://github.com/keithito/tacotron """
|
||||
|
||||
'''
|
||||
Cleaners are transformations that run over the input text at both training and eval time.
|
||||
|
||||
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
|
||||
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
|
||||
1. "english_cleaners" for English text
|
||||
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
|
||||
the Unidecode library (https://pypi.python.org/pypi/Unidecode)
|
||||
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
|
||||
the symbols in symbols.py to match your data).
|
||||
'''
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
from unidecode import unidecode
|
||||
from .numbers import normalize_numbers
|
||||
|
||||
""" from https://github.com/keithito/tacotron """
|
||||
"""
|
||||
Cleaners are transformations that run over the input text at both training and
|
||||
eval time.
|
||||
|
||||
Cleaners can be selected by passing a comma-delimited list of cleaner names as
|
||||
the "cleaners"
|
||||
hyperparameter. Some cleaners are English-specific. You'll typically want to
|
||||
use:
|
||||
1. "english_cleaners" for English text
|
||||
2. "transliteration_cleaners" for non-English text that can be transliterated
|
||||
to ASCII using
|
||||
the Unidecode library (https://pypi.python.org/pypi/Unidecode)
|
||||
3. "basic_cleaners" if you do not want to transliterate (in this case, you
|
||||
should also update
|
||||
the symbols in symbols.py to match your data).
|
||||
"""
|
||||
|
||||
# Regular expression matching whitespace:
|
||||
_whitespace_re = re.compile(r'\s+')
|
||||
_whitespace_re = re.compile(r"\s+")
|
||||
|
||||
# List of (regular expression, replacement) pairs for abbreviations:
|
||||
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
|
||||
('mrs', 'misess'),
|
||||
('mr', 'mister'),
|
||||
('dr', 'doctor'),
|
||||
('st', 'saint'),
|
||||
('co', 'company'),
|
||||
('jr', 'junior'),
|
||||
('maj', 'major'),
|
||||
('gen', 'general'),
|
||||
('drs', 'doctors'),
|
||||
('rev', 'reverend'),
|
||||
('lt', 'lieutenant'),
|
||||
('hon', 'honorable'),
|
||||
('sgt', 'sergeant'),
|
||||
('capt', 'captain'),
|
||||
('esq', 'esquire'),
|
||||
('ltd', 'limited'),
|
||||
('col', 'colonel'),
|
||||
('ft', 'fort'),
|
||||
]]
|
||||
_abbreviations = [
|
||||
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
|
||||
for x in [
|
||||
("mrs", "misess"),
|
||||
("mr", "mister"),
|
||||
("dr", "doctor"),
|
||||
("st", "saint"),
|
||||
("co", "company"),
|
||||
("jr", "junior"),
|
||||
("maj", "major"),
|
||||
("gen", "general"),
|
||||
("drs", "doctors"),
|
||||
("rev", "reverend"),
|
||||
("lt", "lieutenant"),
|
||||
("hon", "honorable"),
|
||||
("sgt", "sergeant"),
|
||||
("capt", "captain"),
|
||||
("esq", "esquire"),
|
||||
("ltd", "limited"),
|
||||
("col", "colonel"),
|
||||
("ft", "fort"),
|
||||
]
|
||||
]
|
||||
|
||||
|
||||
def expand_abbreviations(text):
|
||||
for regex, replacement in _abbreviations:
|
||||
text = re.sub(regex, replacement, text)
|
||||
return text
|
||||
for regex, replacement in _abbreviations:
|
||||
text = re.sub(regex, replacement, text)
|
||||
return text
|
||||
|
||||
|
||||
def expand_numbers(text):
|
||||
return normalize_numbers(text)
|
||||
return normalize_numbers(text)
|
||||
|
||||
|
||||
def lowercase(text):
|
||||
return text.lower()
|
||||
return text.lower()
|
||||
|
||||
|
||||
def collapse_whitespace(text):
|
||||
return re.sub(_whitespace_re, ' ', text)
|
||||
return re.sub(_whitespace_re, " ", text)
|
||||
|
||||
|
||||
def convert_to_ascii(text):
|
||||
return unidecode(text)
|
||||
return unidecode(text)
|
||||
|
||||
|
||||
def basic_cleaners(text):
|
||||
'''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
|
||||
text = lowercase(text)
|
||||
text = collapse_whitespace(text)
|
||||
return text
|
||||
"""Basic pipeline that lowercases and collapses whitespace without
|
||||
transliteration."""
|
||||
text = lowercase(text)
|
||||
text = collapse_whitespace(text)
|
||||
return text
|
||||
|
||||
|
||||
def transliteration_cleaners(text):
|
||||
'''Pipeline for non-English text that transliterates to ASCII.'''
|
||||
text = convert_to_ascii(text)
|
||||
text = lowercase(text)
|
||||
text = collapse_whitespace(text)
|
||||
return text
|
||||
"""Pipeline for non-English text that transliterates to ASCII."""
|
||||
text = convert_to_ascii(text)
|
||||
text = lowercase(text)
|
||||
text = collapse_whitespace(text)
|
||||
return text
|
||||
|
||||
|
||||
def english_cleaners(text):
|
||||
'''Pipeline for English text, including number and abbreviation expansion.'''
|
||||
text = convert_to_ascii(text)
|
||||
text = lowercase(text)
|
||||
text = expand_numbers(text)
|
||||
text = expand_abbreviations(text)
|
||||
text = collapse_whitespace(text)
|
||||
return text
|
||||
"""Pipeline for English text, including number and abbreviation
|
||||
expansion."""
|
||||
text = convert_to_ascii(text)
|
||||
text = lowercase(text)
|
||||
text = expand_numbers(text)
|
||||
text = expand_abbreviations(text)
|
||||
text = collapse_whitespace(text)
|
||||
return text
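# Illustrative behaviour of the English pipeline above (ASCII conversion,
# lowercasing, number and abbreviation expansion, whitespace collapsing):
print(english_cleaners("Mr. Smith lives at 200 Baker St."))
# -> "mister smith lives at two hundred baker saint"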
|
||||
|
|
|
|||
170
text/cmudict.py
170
text/cmudict.py
|
|
@ -1,65 +1,143 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
""" from https://github.com/keithito/tacotron """
|
||||
|
||||
import re
|
||||
|
||||
|
||||
valid_symbols = [
|
||||
'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
|
||||
'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
|
||||
'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
|
||||
'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
|
||||
'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
|
||||
'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
|
||||
'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
|
||||
"AA",
|
||||
"AA0",
|
||||
"AA1",
|
||||
"AA2",
|
||||
"AE",
|
||||
"AE0",
|
||||
"AE1",
|
||||
"AE2",
|
||||
"AH",
|
||||
"AH0",
|
||||
"AH1",
|
||||
"AH2",
|
||||
"AO",
|
||||
"AO0",
|
||||
"AO1",
|
||||
"AO2",
|
||||
"AW",
|
||||
"AW0",
|
||||
"AW1",
|
||||
"AW2",
|
||||
"AY",
|
||||
"AY0",
|
||||
"AY1",
|
||||
"AY2",
|
||||
"B",
|
||||
"CH",
|
||||
"D",
|
||||
"DH",
|
||||
"EH",
|
||||
"EH0",
|
||||
"EH1",
|
||||
"EH2",
|
||||
"ER",
|
||||
"ER0",
|
||||
"ER1",
|
||||
"ER2",
|
||||
"EY",
|
||||
"EY0",
|
||||
"EY1",
|
||||
"EY2",
|
||||
"F",
|
||||
"G",
|
||||
"HH",
|
||||
"IH",
|
||||
"IH0",
|
||||
"IH1",
|
||||
"IH2",
|
||||
"IY",
|
||||
"IY0",
|
||||
"IY1",
|
||||
"IY2",
|
||||
"JH",
|
||||
"K",
|
||||
"L",
|
||||
"M",
|
||||
"N",
|
||||
"NG",
|
||||
"OW",
|
||||
"OW0",
|
||||
"OW1",
|
||||
"OW2",
|
||||
"OY",
|
||||
"OY0",
|
||||
"OY1",
|
||||
"OY2",
|
||||
"P",
|
||||
"R",
|
||||
"S",
|
||||
"SH",
|
||||
"T",
|
||||
"TH",
|
||||
"UH",
|
||||
"UH0",
|
||||
"UH1",
|
||||
"UH2",
|
||||
"UW",
|
||||
"UW0",
|
||||
"UW1",
|
||||
"UW2",
|
||||
"V",
|
||||
"W",
|
||||
"Y",
|
||||
"Z",
|
||||
"ZH",
|
||||
]
|
||||
|
||||
_valid_symbol_set = set(valid_symbols)
|
||||
|
||||
|
||||
class CMUDict:
|
||||
'''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''
|
||||
def __init__(self, file_or_path, keep_ambiguous=True):
|
||||
if isinstance(file_or_path, str):
|
||||
with open(file_or_path, encoding='latin-1') as f:
|
||||
entries = _parse_cmudict(f)
|
||||
else:
|
||||
entries = _parse_cmudict(file_or_path)
|
||||
if not keep_ambiguous:
|
||||
entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
|
||||
self._entries = entries
|
||||
"""Thin wrapper around CMUDict data.
|
||||
http://www.speech.cs.cmu.edu/cgi-bin/cmudict"""
|
||||
|
||||
def __init__(self, file_or_path, keep_ambiguous=True):
|
||||
if isinstance(file_or_path, str):
|
||||
with open(file_or_path, encoding="latin-1") as f:
|
||||
entries = _parse_cmudict(f)
|
||||
else:
|
||||
entries = _parse_cmudict(file_or_path)
|
||||
if not keep_ambiguous:
|
||||
entries = {
|
||||
word: pron for word, pron in entries.items() if len(pron) == 1
|
||||
}
|
||||
self._entries = entries
|
||||
|
||||
def __len__(self):
|
||||
return len(self._entries)
|
||||
|
||||
def lookup(self, word):
|
||||
"""Returns list of ARPAbet pronunciations of the given word."""
|
||||
return self._entries.get(word.upper())
|
||||
|
||||
|
||||
def __len__(self):
|
||||
return len(self._entries)
|
||||
|
||||
|
||||
def lookup(self, word):
|
||||
'''Returns list of ARPAbet pronunciations of the given word.'''
|
||||
return self._entries.get(word.upper())
|
||||
|
||||
|
||||
|
||||
_alt_re = re.compile(r'\([0-9]+\)')
|
||||
_alt_re = re.compile(r"\([0-9]+\)")
|
||||
|
||||
|
||||
def _parse_cmudict(file):
|
||||
cmudict = {}
|
||||
for line in file:
|
||||
if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"):
|
||||
parts = line.split(' ')
|
||||
word = re.sub(_alt_re, '', parts[0])
|
||||
pronunciation = _get_pronunciation(parts[1])
|
||||
if pronunciation:
|
||||
if word in cmudict:
|
||||
cmudict[word].append(pronunciation)
|
||||
else:
|
||||
cmudict[word] = [pronunciation]
|
||||
return cmudict
|
||||
cmudict = {}
|
||||
for line in file:
|
||||
if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"):
|
||||
parts = line.split(" ")
|
||||
word = re.sub(_alt_re, "", parts[0])
|
||||
pronunciation = _get_pronunciation(parts[1])
|
||||
if pronunciation:
|
||||
if word in cmudict:
|
||||
cmudict[word].append(pronunciation)
|
||||
else:
|
||||
cmudict[word] = [pronunciation]
|
||||
return cmudict
|
||||
|
||||
|
||||
def _get_pronunciation(s):
|
||||
parts = s.strip().split(' ')
|
||||
for part in parts:
|
||||
if part not in _valid_symbol_set:
|
||||
return None
|
||||
return ' '.join(parts)
|
||||
parts = s.strip().split(" ")
|
||||
for part in parts:
|
||||
if part not in _valid_symbol_set:
|
||||
return None
|
||||
return " ".join(parts)
|
||||
|
|
|
|||
|
|
@ -1,71 +1,73 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
""" from https://github.com/keithito/tacotron """
|
||||
|
||||
import inflect
|
||||
import re
|
||||
|
||||
|
||||
_inflect = inflect.engine()
|
||||
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
|
||||
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
|
||||
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
|
||||
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
|
||||
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
|
||||
_number_re = re.compile(r'[0-9]+')
|
||||
_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
|
||||
_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
|
||||
_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
|
||||
_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
|
||||
_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
|
||||
_number_re = re.compile(r"[0-9]+")
|
||||
|
||||
|
||||
def _remove_commas(m):
|
||||
return m.group(1).replace(',', '')
|
||||
return m.group(1).replace(",", "")
|
||||
|
||||
|
||||
def _expand_decimal_point(m):
|
||||
return m.group(1).replace('.', ' point ')
|
||||
return m.group(1).replace(".", " point ")
|
||||
|
||||
|
||||
def _expand_dollars(m):
|
||||
match = m.group(1)
|
||||
parts = match.split('.')
|
||||
if len(parts) > 2:
|
||||
return match + ' dollars' # Unexpected format
|
||||
dollars = int(parts[0]) if parts[0] else 0
|
||||
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
|
||||
if dollars and cents:
|
||||
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
|
||||
cent_unit = 'cent' if cents == 1 else 'cents'
|
||||
return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
|
||||
elif dollars:
|
||||
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
|
||||
return '%s %s' % (dollars, dollar_unit)
|
||||
elif cents:
|
||||
cent_unit = 'cent' if cents == 1 else 'cents'
|
||||
return '%s %s' % (cents, cent_unit)
|
||||
else:
|
||||
return 'zero dollars'
|
||||
match = m.group(1)
|
||||
parts = match.split(".")
|
||||
if len(parts) > 2:
|
||||
return match + " dollars" # Unexpected format
|
||||
dollars = int(parts[0]) if parts[0] else 0
|
||||
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
|
||||
if dollars and cents:
|
||||
dollar_unit = "dollar" if dollars == 1 else "dollars"
|
||||
cent_unit = "cent" if cents == 1 else "cents"
|
||||
return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
|
||||
elif dollars:
|
||||
dollar_unit = "dollar" if dollars == 1 else "dollars"
|
||||
return "%s %s" % (dollars, dollar_unit)
|
||||
elif cents:
|
||||
cent_unit = "cent" if cents == 1 else "cents"
|
||||
return "%s %s" % (cents, cent_unit)
|
||||
else:
|
||||
return "zero dollars"
|
||||
|
||||
|
||||
def _expand_ordinal(m):
|
||||
return _inflect.number_to_words(m.group(0))
|
||||
return _inflect.number_to_words(m.group(0))
|
||||
|
||||
|
||||
def _expand_number(m):
|
||||
num = int(m.group(0))
|
||||
if num > 1000 and num < 3000:
|
||||
if num == 2000:
|
||||
return 'two thousand'
|
||||
elif num > 2000 and num < 2010:
|
||||
return 'two thousand ' + _inflect.number_to_words(num % 100)
|
||||
elif num % 100 == 0:
|
||||
return _inflect.number_to_words(num // 100) + ' hundred'
|
||||
num = int(m.group(0))
|
||||
if num > 1000 and num < 3000:
|
||||
if num == 2000:
|
||||
return "two thousand"
|
||||
elif num > 2000 and num < 2010:
|
||||
return "two thousand " + _inflect.number_to_words(num % 100)
|
||||
elif num % 100 == 0:
|
||||
return _inflect.number_to_words(num // 100) + " hundred"
|
||||
else:
|
||||
return _inflect.number_to_words(
|
||||
num, andword="", zero="oh", group=2
|
||||
).replace(", ", " ")
|
||||
else:
|
||||
return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
|
||||
else:
|
||||
return _inflect.number_to_words(num, andword='')
|
||||
return _inflect.number_to_words(num, andword="")
|
||||
|
||||
|
||||
def normalize_numbers(text):
|
||||
text = re.sub(_comma_number_re, _remove_commas, text)
|
||||
text = re.sub(_pounds_re, r'\1 pounds', text)
|
||||
text = re.sub(_dollars_re, _expand_dollars, text)
|
||||
text = re.sub(_decimal_number_re, _expand_decimal_point, text)
|
||||
text = re.sub(_ordinal_re, _expand_ordinal, text)
|
||||
text = re.sub(_number_re, _expand_number, text)
|
||||
return text
|
||||
text = re.sub(_comma_number_re, _remove_commas, text)
|
||||
text = re.sub(_pounds_re, r"\1 pounds", text)
|
||||
text = re.sub(_dollars_re, _expand_dollars, text)
|
||||
text = re.sub(_decimal_number_re, _expand_decimal_point, text)
|
||||
text = re.sub(_ordinal_re, _expand_ordinal, text)
|
||||
text = re.sub(_number_re, _expand_number, text)
|
||||
return text
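# Illustrative behaviour of normalize_numbers above:
print(normalize_numbers("$3.50"))  # -> "three dollars, fifty cents"
print(normalize_numbers("21st"))   # -> "twenty-first" (via inflect)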
|
||||
|
|
|
|||
|
|
@ -1,18 +1,24 @@
|
|||
""" from https://github.com/keithito/tacotron """
|
||||
# -*- coding: utf-8 -*-
|
||||
from . import cmudict
|
||||
|
||||
'''
|
||||
""" from https://github.com/keithito/tacotron """
|
||||
"""
|
||||
Defines the set of symbols used in text input to the model.
|
||||
|
||||
The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. '''
|
||||
from text import cmudict
|
||||
The default is a set of ASCII characters that works well for English or text
|
||||
that has been run through Unidecode. For other data, you can modify
|
||||
_characters. See TRAINING_DATA.md for details. """
|
||||
|
||||
_pad = '_'
|
||||
_punctuation = '!\'(),.:;? '
|
||||
_special = '-'
|
||||
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
|
||||
_pad = "_"
|
||||
_punctuation = "!'(),.:;? "
|
||||
_special = "-"
|
||||
_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
|
||||
|
||||
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
|
||||
_arpabet = ['@' + s for s in cmudict.valid_symbols]
|
||||
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as
|
||||
# uppercase letters):
|
||||
_arpabet = ["@" + s for s in cmudict.valid_symbols]
|
||||
|
||||
# Export all symbols:
|
||||
symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet
|
||||
symbols = (
|
||||
[_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet
|
||||
)
|
||||
|
|
|
|||
290
train.py
290
train.py
|
|
@ -1,290 +0,0 @@
|
|||
import os
|
||||
import time
|
||||
import argparse
|
||||
import math
|
||||
from numpy import finfo
|
||||
|
||||
import torch
|
||||
from distributed import apply_gradient_allreduce
|
||||
import torch.distributed as dist
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from model import Tacotron2
|
||||
from data_utils import TextMelLoader, TextMelCollate
|
||||
from loss_function import Tacotron2Loss
|
||||
from logger import Tacotron2Logger
|
||||
from hparams import create_hparams
|
||||
|
||||
|
||||
def reduce_tensor(tensor, n_gpus):
|
||||
rt = tensor.clone()
|
||||
dist.all_reduce(rt, op=dist.reduce_op.SUM)
|
||||
rt /= n_gpus
|
||||
return rt
|
||||
|
||||
|
||||
def init_distributed(hparams, n_gpus, rank, group_name):
|
||||
assert torch.cuda.is_available(), "Distributed mode requires CUDA."
|
||||
print("Initializing Distributed")
|
||||
|
||||
# Set cuda device so everything is done on the right GPU.
|
||||
torch.cuda.set_device(rank % torch.cuda.device_count())
|
||||
|
||||
# Initialize distributed communication
|
||||
dist.init_process_group(
|
||||
backend=hparams.dist_backend, init_method=hparams.dist_url,
|
||||
world_size=n_gpus, rank=rank, group_name=group_name)
|
||||
|
||||
print("Done initializing distributed")
|
||||
|
||||
|
||||
def prepare_dataloaders(hparams):
|
||||
# Get data, data loaders and collate function ready
|
||||
trainset = TextMelLoader(hparams.training_files, hparams)
|
||||
valset = TextMelLoader(hparams.validation_files, hparams)
|
||||
collate_fn = TextMelCollate(hparams.n_frames_per_step)
|
||||
|
||||
if hparams.distributed_run:
|
||||
train_sampler = DistributedSampler(trainset)
|
||||
shuffle = False
|
||||
else:
|
||||
train_sampler = None
|
||||
shuffle = True
|
||||
|
||||
train_loader = DataLoader(trainset, num_workers=1, shuffle=shuffle,
|
||||
sampler=train_sampler,
|
||||
batch_size=hparams.batch_size, pin_memory=False,
|
||||
drop_last=True, collate_fn=collate_fn)
|
||||
return train_loader, valset, collate_fn
|
||||
|
||||
|
||||
def prepare_directories_and_logger(output_directory, log_directory, rank):
|
||||
if rank == 0:
|
||||
if not os.path.isdir(output_directory):
|
||||
os.makedirs(output_directory)
|
||||
os.chmod(output_directory, 0o775)
|
||||
logger = Tacotron2Logger(os.path.join(output_directory, log_directory))
|
||||
else:
|
||||
logger = None
|
||||
return logger
|
||||
|
||||
|
||||
def load_model(hparams):
|
||||
model = Tacotron2(hparams)
|
||||
if hparams.fp16_run:
|
||||
model.decoder.attention_layer.score_mask_value = finfo('float16').min
|
||||
|
||||
if hparams.distributed_run:
|
||||
model = apply_gradient_allreduce(model)
|
||||
|
||||
return model
|
||||
|
||||
|
||||
def warm_start_model(checkpoint_path, model, ignore_layers):
|
||||
assert os.path.isfile(checkpoint_path)
|
||||
print("Warm starting model from checkpoint '{}'".format(checkpoint_path))
|
||||
checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
|
||||
model_dict = checkpoint_dict['state_dict']
|
||||
if len(ignore_layers) > 0:
|
||||
model_dict = {k: v for k, v in model_dict.items()
|
||||
if k not in ignore_layers}
|
||||
dummy_dict = model.state_dict()
|
||||
dummy_dict.update(model_dict)
|
||||
model_dict = dummy_dict
|
||||
model.load_state_dict(model_dict)
|
||||
return model
|
||||
|
||||
|
||||
def load_checkpoint(checkpoint_path, model, optimizer):
|
||||
assert os.path.isfile(checkpoint_path)
|
||||
print("Loading checkpoint '{}'".format(checkpoint_path))
|
||||
checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
|
||||
model.load_state_dict(checkpoint_dict['state_dict'])
|
||||
optimizer.load_state_dict(checkpoint_dict['optimizer'])
|
||||
learning_rate = checkpoint_dict['learning_rate']
|
||||
iteration = checkpoint_dict['iteration']
|
||||
print("Loaded checkpoint '{}' from iteration {}" .format(
|
||||
checkpoint_path, iteration))
|
||||
return model, optimizer, learning_rate, iteration
|
||||
|
||||
|
||||
def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
|
||||
print("Saving model and optimizer state at iteration {} to {}".format(
|
||||
iteration, filepath))
|
||||
torch.save({'iteration': iteration,
|
||||
'state_dict': model.state_dict(),
|
||||
'optimizer': optimizer.state_dict(),
|
||||
'learning_rate': learning_rate}, filepath)
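# Illustrative way to inspect a checkpoint written by save_checkpoint above;
# the file name is whatever train() produced, e.g. checkpoint_15000.
import torch

ckpt = torch.load("checkpoint_15000", map_location="cpu")
print(sorted(ckpt.keys()))  # ['iteration', 'learning_rate', 'optimizer', 'state_dict']
print(ckpt["iteration"], ckpt["learning_rate"])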
|
||||
|
||||
|
||||
def validate(model, criterion, valset, iteration, batch_size, n_gpus,
|
||||
collate_fn, logger, distributed_run, rank):
|
||||
"""Handles all the validation scoring and printing"""
|
||||
model.eval()
|
||||
with torch.no_grad():
|
||||
val_sampler = DistributedSampler(valset) if distributed_run else None
|
||||
val_loader = DataLoader(valset, sampler=val_sampler, num_workers=1,
|
||||
shuffle=False, batch_size=batch_size,
|
||||
pin_memory=False, collate_fn=collate_fn)
|
||||
|
||||
val_loss = 0.0
|
||||
for i, batch in enumerate(val_loader):
|
||||
x, y = model.parse_batch(batch)
|
||||
y_pred = model(x)
|
||||
loss = criterion(y_pred, y)
|
||||
if distributed_run:
|
||||
reduced_val_loss = reduce_tensor(loss.data, n_gpus).item()
|
||||
else:
|
||||
reduced_val_loss = loss.item()
|
||||
val_loss += reduced_val_loss
|
||||
val_loss = val_loss / (i + 1)
|
||||
|
||||
model.train()
|
||||
if rank == 0:
|
||||
print("Validation loss {}: {:9f} ".format(iteration, reduced_val_loss))
|
||||
logger.log_validation(reduced_val_loss, model, y, y_pred, iteration)
|
||||
|
||||
|
||||
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
|
||||
rank, group_name, hparams):
|
||||
"""Training and validation logging results to tensorboard and stdout
|
||||
|
||||
Params
|
||||
------
|
||||
output_directory (string): directory to save checkpoints
|
||||
log_directory (string) directory to save tensorboard logs
|
||||
checkpoint_path(string): checkpoint path
|
||||
n_gpus (int): number of gpus
|
||||
rank (int): rank of current gpu
|
||||
hparams (object): comma separated list of "name=value" pairs.
|
||||
"""
|
||||
if hparams.distributed_run:
|
||||
init_distributed(hparams, n_gpus, rank, group_name)
|
||||
|
||||
torch.manual_seed(hparams.seed)
|
||||
torch.cuda.manual_seed(hparams.seed)
|
||||
|
||||
model = load_model(hparams)
|
||||
learning_rate = hparams.learning_rate
|
||||
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
|
||||
weight_decay=hparams.weight_decay)
|
||||
|
||||
if hparams.fp16_run:
|
||||
from apex import amp
|
||||
model, optimizer = amp.initialize(
|
||||
model, optimizer, opt_level='O2')
|
||||
|
||||
if hparams.distributed_run:
|
||||
model = apply_gradient_allreduce(model)
|
||||
|
||||
criterion = Tacotron2Loss()
|
||||
|
||||
logger = prepare_directories_and_logger(
|
||||
output_directory, log_directory, rank)
|
||||
|
||||
train_loader, valset, collate_fn = prepare_dataloaders(hparams)
|
||||
|
||||
# Load checkpoint if one exists
|
||||
iteration = 0
|
||||
epoch_offset = 0
|
||||
if checkpoint_path is not None:
|
||||
if warm_start:
|
||||
model = warm_start_model(
|
||||
checkpoint_path, model, hparams.ignore_layers)
|
||||
else:
|
||||
model, optimizer, _learning_rate, iteration = load_checkpoint(
|
||||
checkpoint_path, model, optimizer)
|
||||
if hparams.use_saved_learning_rate:
|
||||
learning_rate = _learning_rate
|
||||
iteration += 1 # next iteration is iteration + 1
|
||||
epoch_offset = max(0, int(iteration / len(train_loader)))
|
||||
|
||||
model.train()
|
||||
is_overflow = False
|
||||
# ================ MAIN TRAINNIG LOOP! ===================
|
||||
for epoch in range(epoch_offset, hparams.epochs):
|
||||
print("Epoch: {}".format(epoch))
|
||||
for i, batch in enumerate(train_loader):
|
||||
start = time.perf_counter()
|
||||
for param_group in optimizer.param_groups:
|
||||
param_group['lr'] = learning_rate
|
||||
|
||||
model.zero_grad()
|
||||
x, y = model.parse_batch(batch)
|
||||
y_pred = model(x)
|
||||
|
||||
loss = criterion(y_pred, y)
|
||||
if hparams.distributed_run:
|
||||
reduced_loss = reduce_tensor(loss.data, n_gpus).item()
|
||||
else:
|
||||
reduced_loss = loss.item()
|
||||
if hparams.fp16_run:
|
||||
with amp.scale_loss(loss, optimizer) as scaled_loss:
|
||||
scaled_loss.backward()
|
||||
else:
|
||||
loss.backward()
|
||||
|
||||
if hparams.fp16_run:
|
||||
grad_norm = torch.nn.utils.clip_grad_norm_(
|
||||
amp.master_params(optimizer), hparams.grad_clip_thresh)
|
||||
is_overflow = math.isnan(grad_norm)
|
||||
else:
|
||||
grad_norm = torch.nn.utils.clip_grad_norm_(
|
||||
model.parameters(), hparams.grad_clip_thresh)
|
||||
|
||||
optimizer.step()
|
||||
|
||||
if not is_overflow and rank == 0:
|
||||
duration = time.perf_counter() - start
|
||||
print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
|
||||
iteration, reduced_loss, grad_norm, duration))
|
||||
logger.log_training(
|
||||
reduced_loss, grad_norm, learning_rate, duration, iteration)
|
||||
|
||||
if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0):
|
||||
validate(model, criterion, valset, iteration,
|
||||
hparams.batch_size, n_gpus, collate_fn, logger,
|
||||
hparams.distributed_run, rank)
|
||||
if rank == 0:
|
||||
checkpoint_path = os.path.join(
|
||||
output_directory, "checkpoint_{}".format(iteration))
|
||||
save_checkpoint(model, optimizer, learning_rate, iteration,
|
||||
checkpoint_path)
|
||||
|
||||
iteration += 1
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('-o', '--output_directory', type=str,
|
||||
help='directory to save checkpoints')
|
||||
parser.add_argument('-l', '--log_directory', type=str,
|
||||
help='directory to save tensorboard logs')
|
||||
parser.add_argument('-c', '--checkpoint_path', type=str, default=None,
|
||||
required=False, help='checkpoint path')
|
||||
parser.add_argument('--warm_start', action='store_true',
|
||||
help='load model weights only, ignore specified layers')
|
||||
parser.add_argument('--n_gpus', type=int, default=1,
|
||||
required=False, help='number of gpus')
|
||||
parser.add_argument('--rank', type=int, default=0,
|
||||
required=False, help='rank of current gpu')
|
||||
parser.add_argument('--group_name', type=str, default='group_name',
|
||||
required=False, help='Distributed group name')
|
||||
parser.add_argument('--hparams', type=str,
|
||||
required=False, help='comma separated name=value pairs')
|
||||
|
||||
args = parser.parse_args()
|
||||
hparams = create_hparams(args.hparams)
|
||||
|
||||
torch.backends.cudnn.enabled = hparams.cudnn_enabled
|
||||
torch.backends.cudnn.benchmark = hparams.cudnn_benchmark
|
||||
|
||||
print("FP16 Run:", hparams.fp16_run)
|
||||
print("Dynamic Loss Scaling:", hparams.dynamic_loss_scaling)
|
||||
print("Distributed Run:", hparams.distributed_run)
|
||||
print("cuDNN Enabled:", hparams.cudnn_enabled)
|
||||
print("cuDNN Benchmark:", hparams.cudnn_benchmark)
|
||||
|
||||
train(args.output_directory, args.log_directory, args.checkpoint_path,
|
||||
args.warm_start, args.n_gpus, args.rank, args.group_name, hparams)
|
||||
177
tts.py
177
tts.py
|
|
@ -1,177 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
# coding: utf-8
|
||||
|
||||
import sys
|
||||
import numpy as np
|
||||
import torch
|
||||
from hparams import create_hparams
|
||||
from model import Tacotron2
|
||||
from train import load_model
|
||||
from text import text_to_sequence
|
||||
import os
|
||||
import soundfile as sf
|
||||
import pyaudio
|
||||
import klepto
|
||||
from librosa import resample
|
||||
from librosa.effects import time_stretch
|
||||
from sia.file_utils import cached_model_path
|
||||
from sia.instruments import do_time
|
||||
from glow import WaveGlow
|
||||
|
||||
TTS_SAMPLE_RATE = 22050
|
||||
OUTPUT_SAMPLE_RATE = 16000
|
||||
|
||||
# https://github.com/NVIDIA/waveglow/blob/master/config.json
|
||||
WAVEGLOW_CONFIG = {
|
||||
"n_mel_channels": 80,
|
||||
"n_flows": 12,
|
||||
"n_group": 8,
|
||||
"n_early_every": 4,
|
||||
"n_early_size": 2,
|
||||
"WN_config": {
|
||||
"n_layers": 8,
|
||||
"n_channels": 256,
|
||||
"kernel_size": 3
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class TTSModel(object):
|
||||
"""docstring for TTSModel."""
|
||||
|
||||
def __init__(self):
|
||||
super(TTSModel, self).__init__()
|
||||
hparams = create_hparams()
|
||||
hparams.sampling_rate = TTS_SAMPLE_RATE
|
||||
self.model = load_model(hparams)
|
||||
tacotron2_path = cached_model_path("tacotron2_model")
|
||||
self.model.load_state_dict(
|
||||
torch.load(tacotron2_path, map_location='cpu')['state_dict'])
|
||||
self.model.eval()
|
||||
waveglow_path = cached_model_path('waveglow_model')
|
||||
self.waveglow = WaveGlow(**WAVEGLOW_CONFIG)
|
||||
wave_params = torch.load(waveglow_path, map_location='cpu')
|
||||
self.waveglow.load_state_dict(wave_params)
|
||||
self.waveglow.eval()
|
||||
for k in self.waveglow.convinv:
|
||||
k.float()
|
||||
self.k_cache = klepto.archives.file_archive(cached=False)
|
||||
self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
|
||||
self.synth_speech)
|
||||
|
||||
# https://github.com/NVIDIA/waveglow/issues/127
|
||||
for m in self.waveglow.modules():
|
||||
if 'Conv' in str(type(m)):
|
||||
setattr(m, 'padding_mode', 'zeros')
|
||||
|
||||
@do_time
|
||||
def synth_speech(self, t):
|
||||
text = t
|
||||
sequence = np.array(text_to_sequence(text,
|
||||
['english_cleaners']))[None, :]
|
||||
sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long()
|
||||
mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(
|
||||
sequence)
|
||||
with torch.no_grad():
|
||||
audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)
|
||||
audio = audio_t[0].data.cpu().numpy()
|
||||
# data = convert(audio)
|
||||
slow_data = time_stretch(audio, 0.8)
|
||||
float_data = resample(slow_data, TTS_SAMPLE_RATE, OUTPUT_SAMPLE_RATE)
|
||||
data = float2pcm(float_data)
|
||||
return data.tobytes()
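# Illustrative use of TTSModel above (requires the cached tacotron2/waveglow
# checkpoints to be available): the returned bytes are raw 16 kHz, mono,
# 16-bit PCM, so they can be wrapped in a WAV container directly.
import numpy as np
import soundfile as sf

tts = TTSModel()
pcm = tts.synth_speech("The quick brown fox jumps over the lazy dog.")
samples = np.frombuffer(pcm, dtype=np.int16)
sf.write("out.wav", samples, OUTPUT_SAMPLE_RATE)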
|
||||
|
||||
|
||||
def convert(array):
|
||||
sf.write('sample.wav', array, TTS_SAMPLE_RATE)
|
||||
# convert to $OUTPUT_SAMPLE_RATE
|
||||
os.system('ffmpeg -i {0} -filter:a "atempo=0.80" -ar 16k {1}'.format(
|
||||
'sample.wav', 'sample0.wav'))
|
||||
data, rate = sf.read('sample0.wav', dtype='int16')
|
||||
os.remove('sample.wav')
|
||||
os.remove('sample0.wav')
|
||||
return data
|
||||
|
||||
|
||||
# https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
|
||||
def float2pcm(sig, dtype='int16'):
|
||||
"""Convert floating point signal with a range from -1 to 1 to PCM.
|
||||
Any signal values outside the interval [-1.0, 1.0) are clipped.
|
||||
No dithering is used.
|
||||
Note that there are different possibilities for scaling floating
|
||||
point numbers to PCM numbers, this function implements just one of
|
||||
them. For an overview of alternatives see
|
||||
http://blog.bjornroche.com/2009/12/int-float-int-its-jungle-out-there.html
|
||||
Parameters
|
||||
----------
|
||||
sig : array_like
|
||||
Input array, must have floating point type.
|
||||
dtype : data type, optional
|
||||
Desired (integer) data type.
|
||||
Returns
|
||||
-------
|
||||
numpy.ndarray
|
||||
Integer data, scaled and clipped to the range of the given
|
||||
*dtype*.
|
||||
See Also
|
||||
--------
|
||||
pcm2float, dtype
|
||||
"""
|
||||
sig = np.asarray(sig)
|
||||
if sig.dtype.kind != 'f':
|
||||
raise TypeError("'sig' must be a float array")
|
||||
dtype = np.dtype(dtype)
|
||||
if dtype.kind not in 'iu':
|
||||
raise TypeError("'dtype' must be an integer type")
|
||||
|
||||
i = np.iinfo(dtype)
|
||||
abs_max = 2**(i.bits - 1)
|
||||
offset = i.min + abs_max
|
||||
return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)
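# Illustrative conversion with float2pcm above (synthetic samples):
import numpy as np

sig = np.array([0.0, 0.5, -0.5, 0.999], dtype=np.float32)
print(float2pcm(sig))  # -> [     0  16384 -16384  32735]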
|
||||
|
||||
|
||||
def display(data):
|
||||
import IPython.display as ipd
|
||||
aud = ipd.Audio(data, rate=16000)
|
||||
return aud
|
||||
|
||||
|
||||
def player_gen():
|
||||
audio_interface = pyaudio.PyAudio()
|
||||
_audio_stream = audio_interface.open(format=pyaudio.paInt16,
|
||||
channels=1,
|
||||
rate=OUTPUT_SAMPLE_RATE,
|
||||
output=True)
|
||||
|
||||
def play_device(data):
|
||||
_audio_stream.write(data)
|
||||
# _audio_stream.close()
|
||||
|
||||
return play_device
|
||||
|
||||
|
||||
def synthesize_corpus():
|
||||
tts_model = TTSModel()
|
||||
all_data = []
|
||||
for (i, line) in enumerate(open('corpus.txt').readlines()):
|
||||
print('synthesizing... "{}"'.format(line.strip()))
|
||||
data = tts_model.synth_speech(line.strip())
|
||||
all_data.append(data)
|
||||
return all_data
|
||||
|
||||
|
||||
def play_corpus(corpus_synths):
|
||||
player = player_gen()
|
||||
for d in corpus_synths:
|
||||
player(d)
|
||||
|
||||
|
||||
def main():
|
||||
corpus_synth_data = synthesize_corpus()
|
||||
play_corpus(corpus_synth_data)
|
||||
import ipdb
|
||||
ipdb.set_trace()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
29
utils.py

@ -1,29 +0,0 @@
import numpy as np
from scipy.io.wavfile import read
import torch


def get_mask_from_lengths(lengths):
    max_len = torch.max(lengths).item()
    ids = torch.arange(0, max_len, out=torch.LongTensor(max_len))  # initially out = torch.LongTensor(max_len)
    mask = (ids < lengths.unsqueeze(1)).byte()
    return mask


def load_wav_to_torch(full_path):
    sampling_rate, data = read(full_path)
    return torch.FloatTensor(data.astype(np.float32)), sampling_rate


def load_filepaths_and_text(filename, split="|"):
    with open(filename, encoding='utf-8') as f:
        filepaths_and_text = [line.strip().split(split) for line in f]
    return filepaths_and_text


def to_gpu(x):
    x = x.contiguous()

    # if torch.cuda.is_available():  # initially not commented out
    #     x = x.cuda(non_blocking=True)  # initially not commented out
    return torch.autograd.Variable(x)
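# Illustrative output of get_mask_from_lengths above (valid positions are 1,
# padded positions are 0):
import torch

print(get_mask_from_lengths(torch.tensor([4, 2])))
# tensor([[1, 1, 1, 1],
#         [1, 1, 0, 0]], dtype=torch.uint8)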