From 5f75aa0a0d134c8934b2c6c6afab69a17128e0a5 Mon Sep 17 00:00:00 2001
From: Malar Kannan
Date: Wed, 3 Jul 2019 15:08:00 +0530
Subject: [PATCH] integrate tacotron2/waveglow based tts server

---
 LICENSE              |  29 ---
 README.md            |  23 --
 audio_processing.py  |  93 --------
 corpus.txt           |  28 ---
 data_utils.py        | 111 ---------
 demo_client.py       |  27 ---
 distributed.py       | 173 --------------
 glow.py              | 311 -------------------------
 hparams.py           |  96 --------
 layers.py            |  80 -------
 logger.py            |  48 ----
 loss_function.py     |  19 --
 model.py             | 529 -------------------------------------------
 plotting_utils.py    |  61 -----
 requirements.txt     |  10 -
 requirements_dev.txt |  15 --
 server.py            |  39 ----
 stft.py              | 141 ------------
 text/__init__.py     |  79 +++----
 text/cleaners.py     | 125 +++++-----
 text/cmudict.py      | 170 ++++++++++----
 text/numbers.py      |  94 ++++----
 text/symbols.py      |  28 ++-
 train.py             | 290 ------------------------
 tts.py               | 177 ---------------
 utils.py             |  29 ---
 26 files changed, 297 insertions(+), 2528 deletions(-)
 delete mode 100644 LICENSE
 delete mode 100644 README.md
 delete mode 100644 audio_processing.py
 delete mode 100644 corpus.txt
 delete mode 100644 data_utils.py
 delete mode 100644 demo_client.py
 delete mode 100644 distributed.py
 delete mode 100644 glow.py
 delete mode 100644 hparams.py
 delete mode 100644 layers.py
 delete mode 100644 logger.py
 delete mode 100644 loss_function.py
 delete mode 100644 model.py
 delete mode 100644 plotting_utils.py
 delete mode 100644 requirements.txt
 delete mode 100644 requirements_dev.txt
 delete mode 100644 server.py
 delete mode 100644 stft.py
 delete mode 100644 train.py
 delete mode 100644 tts.py
 delete mode 100644 utils.py

diff --git a/LICENSE b/LICENSE
deleted file mode 100644
index 2a718d6..0000000
--- a/LICENSE
+++ /dev/null
@@ -1,29 +0,0 @@
-BSD 3-Clause License
-
-Copyright (c) 2018, NVIDIA Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-* Redistributions of source code must retain the above copyright notice, this
-  list of conditions and the following disclaimer.
-
-* Redistributions in binary form must reproduce the above copyright notice,
-  this list of conditions and the following disclaimer in the documentation
-  and/or other materials provided with the distribution.
-
-* Neither the name of the copyright holder nor the names of its
-  contributors may be used to endorse or promote products derived from
-  this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
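
For reference, the modules removed below fit together as a text-to-speech pipeline: text/ normalizes text into symbol IDs, model.py (Tacotron2) predicts a mel spectrogram from those IDs, and glow.py (WaveGlow) converts the mel spectrogram into audio samples. A minimal inference sketch, assuming the removed files are importable as-is, TensorFlow 1.x is available for create_hparams, and the checkpoints named in the README exist; the 'state_dict' and 'model' checkpoint keys below are assumptions about how those checkpoints were saved:

    import torch
    from hparams import create_hparams
    from model import Tacotron2
    from text import text_to_sequence

    # Load the acoustic model (text -> mel spectrogram).
    hparams = create_hparams()
    tacotron2 = Tacotron2(hparams)
    state = torch.load('checkpoint_15000', map_location='cpu')
    tacotron2.load_state_dict(state['state_dict'])
    tacotron2.eval()

    # Load the WaveGlow vocoder (mel spectrogram -> waveform).
    waveglow = torch.load('waveglow_256channels.pt', map_location='cpu')['model']
    waveglow.eval()

    # Convert text to a (1, T) tensor of symbol IDs using the same cleaners as training.
    sequence = torch.LongTensor(
        text_to_sequence('How may I help you today?', ['english_cleaners']))[None, :]

    with torch.no_grad():
        _, mel_postnet, _, _ = tacotron2.inference(sequence)   # mel after the postnet
        audio = waveglow.infer(mel_postnet, sigma=0.666)        # raw audio samples

server.py wraps this same path behind a gRPC TextToSpeechAPI endpoint, and demo_client.py sends text to it and plays back the returned samples.
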
diff --git a/README.md b/README.md deleted file mode 100644 index a953b8f..0000000 --- a/README.md +++ /dev/null @@ -1,23 +0,0 @@ - -## Setup -- clone the repo - -`git clone https://github.com/agaralabs/tacotron2` -- cd to `tacotron2` copy models from wolverine: - -`scp wolverine:/home/ubuntu/tacotron2/{checkpoint_15000,waveglow_256channels.pt} ./` - -`scp wolverine:/home/ubuntu/tacotron2/waveglow ./` - -**Wolverine Details:** -``` -Host wolverine - Hostname 54.71.137.17 - User ubuntu - IdentityFile ~/.ssh/id_hip_ml -``` -install the dependencies -`pip install requirements.txt` - -## Running: -`python final.py` diff --git a/audio_processing.py b/audio_processing.py deleted file mode 100644 index b5af7f7..0000000 --- a/audio_processing.py +++ /dev/null @@ -1,93 +0,0 @@ -import torch -import numpy as np -from scipy.signal import get_window -import librosa.util as librosa_util - - -def window_sumsquare(window, n_frames, hop_length=200, win_length=800, - n_fft=800, dtype=np.float32, norm=None): - """ - # from librosa 0.6 - Compute the sum-square envelope of a window function at a given hop length. - - This is used to estimate modulation effects induced by windowing - observations in short-time fourier transforms. - - Parameters - ---------- - window : string, tuple, number, callable, or list-like - Window specification, as in `get_window` - - n_frames : int > 0 - The number of analysis frames - - hop_length : int > 0 - The number of samples to advance between frames - - win_length : [optional] - The length of the window function. By default, this matches `n_fft`. - - n_fft : int > 0 - The length of each analysis frame. - - dtype : np.dtype - The data type of the output - - Returns - ------- - wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` - The sum-squared envelope of the window function - """ - if win_length is None: - win_length = n_fft - - n = n_fft + hop_length * (n_frames - 1) - x = np.zeros(n, dtype=dtype) - - # Compute the squared window at the desired length - win_sq = get_window(window, win_length, fftbins=True) - win_sq = librosa_util.normalize(win_sq, norm=norm)**2 - win_sq = librosa_util.pad_center(win_sq, n_fft) - - # Fill the envelope - for i in range(n_frames): - sample = i * hop_length - x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))] - return x - - -def griffin_lim(magnitudes, stft_fn, n_iters=30): - """ - PARAMS - ------ - magnitudes: spectrogram magnitudes - stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods - """ - - angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) - angles = angles.astype(np.float32) - angles = torch.autograd.Variable(torch.from_numpy(angles)) - signal = stft_fn.inverse(magnitudes, angles).squeeze(1) - - for i in range(n_iters): - _, angles = stft_fn.transform(signal) - signal = stft_fn.inverse(magnitudes, angles).squeeze(1) - return signal - - -def dynamic_range_compression(x, C=1, clip_val=1e-5): - """ - PARAMS - ------ - C: compression factor - """ - return torch.log(torch.clamp(x, min=clip_val) * C) - - -def dynamic_range_decompression(x, C=1): - """ - PARAMS - ------ - C: compression factor used to compress - """ - return torch.exp(x) / C diff --git a/corpus.txt b/corpus.txt deleted file mode 100644 index 1b8b0b1..0000000 --- a/corpus.txt +++ /dev/null @@ -1,28 +0,0 @@ -Thank you for calling Pampers. -How may I help you today? -I understand your frustration and disappointment. -I'm sorry it's happening and I'd like to help prevent it in the future. 
-What style of Baby Dry did you buy? -Was it the Regular or the Flexible? -I have all the information I need about the specifics of the product you purchased now. -Thank you for your patience! -How many diapers came in the package??? -And what size were they? -Were they the small. The medium or the large ones??? -Sorry, without the size and count information I will be able to reimburse you with only a minimum fulfillment. -Would that be okay?? -So you bought the Pampers Baby Dry and there were 32 diapers in the package. -Is that correct? -Thank you for all that information.' -I will definitely pass on your detailed feedback to our Quality Control Team! -I could also suggest a different variant of diapers, that might better suit your needs. -Would you like me to help you with that????? -How old is your little one? -And do you have specific diaper needs that you can help me with?? -Our cruzers are made especially for active babies, and I would definitely suggest them since you said your little one moves around a lot. -What I can do for you is, I can also include a coupon towards your next purchase of Pampers products. -Shall I go ahead with this? -I can send you a link over text message,,, from which you can directly order this. -Is there anything else I can help you with today? -Thank you for reaching out to us. -Have a good day!!! Bye! diff --git a/data_utils.py b/data_utils.py deleted file mode 100644 index fdfd287..0000000 --- a/data_utils.py +++ /dev/null @@ -1,111 +0,0 @@ -import random -import numpy as np -import torch -import torch.utils.data - -import layers -from utils import load_wav_to_torch, load_filepaths_and_text -from text import text_to_sequence - - -class TextMelLoader(torch.utils.data.Dataset): - """ - 1) loads audio,text pairs - 2) normalizes text and converts them to sequences of one-hot vectors - 3) computes mel-spectrograms from audio files. 
- """ - def __init__(self, audiopaths_and_text, hparams): - self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text) - self.text_cleaners = hparams.text_cleaners - self.max_wav_value = hparams.max_wav_value - self.sampling_rate = hparams.sampling_rate - self.load_mel_from_disk = hparams.load_mel_from_disk - self.stft = layers.TacotronSTFT( - hparams.filter_length, hparams.hop_length, hparams.win_length, - hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin, - hparams.mel_fmax) - random.seed(1234) - random.shuffle(self.audiopaths_and_text) - - def get_mel_text_pair(self, audiopath_and_text): - # separate filename and text - audiopath, text = audiopath_and_text[0], audiopath_and_text[1] - text = self.get_text(text) - mel = self.get_mel(audiopath) - return (text, mel) - - def get_mel(self, filename): - if not self.load_mel_from_disk: - audio, sampling_rate = load_wav_to_torch(filename) - if sampling_rate != self.stft.sampling_rate: - raise ValueError("{} {} SR doesn't match target {} SR".format( - sampling_rate, self.stft.sampling_rate)) - audio_norm = audio / self.max_wav_value - audio_norm = audio_norm.unsqueeze(0) - audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False) - melspec = self.stft.mel_spectrogram(audio_norm) - melspec = torch.squeeze(melspec, 0) - else: - melspec = torch.from_numpy(np.load(filename)) - assert melspec.size(0) == self.stft.n_mel_channels, ( - 'Mel dimension mismatch: given {}, expected {}'.format( - melspec.size(0), self.stft.n_mel_channels)) - - return melspec - - def get_text(self, text): - text_norm = torch.IntTensor(text_to_sequence(text, self.text_cleaners)) - return text_norm - - def __getitem__(self, index): - return self.get_mel_text_pair(self.audiopaths_and_text[index]) - - def __len__(self): - return len(self.audiopaths_and_text) - - -class TextMelCollate(): - """ Zero-pads model inputs and targets based on number of frames per setep - """ - def __init__(self, n_frames_per_step): - self.n_frames_per_step = n_frames_per_step - - def __call__(self, batch): - """Collate's training batch from normalized text and mel-spectrogram - PARAMS - ------ - batch: [text_normalized, mel_normalized] - """ - # Right zero-pad all one-hot text sequences to max input length - input_lengths, ids_sorted_decreasing = torch.sort( - torch.LongTensor([len(x[0]) for x in batch]), - dim=0, descending=True) - max_input_len = input_lengths[0] - - text_padded = torch.LongTensor(len(batch), max_input_len) - text_padded.zero_() - for i in range(len(ids_sorted_decreasing)): - text = batch[ids_sorted_decreasing[i]][0] - text_padded[i, :text.size(0)] = text - - # Right zero-pad mel-spec - num_mels = batch[0][1].size(0) - max_target_len = max([x[1].size(1) for x in batch]) - if max_target_len % self.n_frames_per_step != 0: - max_target_len += self.n_frames_per_step - max_target_len % self.n_frames_per_step - assert max_target_len % self.n_frames_per_step == 0 - - # include mel padded and gate padded - mel_padded = torch.FloatTensor(len(batch), num_mels, max_target_len) - mel_padded.zero_() - gate_padded = torch.FloatTensor(len(batch), max_target_len) - gate_padded.zero_() - output_lengths = torch.LongTensor(len(batch)) - for i in range(len(ids_sorted_decreasing)): - mel = batch[ids_sorted_decreasing[i]][1] - mel_padded[i, :, :mel.size(1)] = mel - gate_padded[i, mel.size(1)-1:] = 1 - output_lengths[i] = mel.size(1) - - return text_padded, input_lengths, mel_padded, gate_padded, \ - output_lengths diff --git a/demo_client.py b/demo_client.py deleted 
file mode 100644 index b00a53a..0000000 --- a/demo_client.py +++ /dev/null @@ -1,27 +0,0 @@ -import grpc -from sia.proto import tts_pb2 -from sia.proto import tts_pb2_grpc -from tts import player_gen - - -def tts_player(): - player = player_gen() - channel = grpc.insecure_channel('localhost:50060') - stub = tts_pb2_grpc.ServerStub(channel) - - def play(t): - test_text = tts_pb2.TextInput(text=t) - speech = stub.TextToSpeechAPI(test_text) - player(speech.response) - return play - - -def main(): - play = tts_player() - play('How may I help you today?') - import ipdb - ipdb.set_trace() - - -if __name__ == '__main__': - main() diff --git a/distributed.py b/distributed.py deleted file mode 100644 index cce7494..0000000 --- a/distributed.py +++ /dev/null @@ -1,173 +0,0 @@ -import torch -import torch.distributed as dist -from torch.nn.modules import Module -from torch.autograd import Variable - -def _flatten_dense_tensors(tensors): - """Flatten dense tensors into a contiguous 1D buffer. Assume tensors are of - same dense type. - Since inputs are dense, the resulting tensor will be a concatenated 1D - buffer. Element-wise operation on this buffer will be equivalent to - operating individually. - Arguments: - tensors (Iterable[Tensor]): dense tensors to flatten. - Returns: - A contiguous 1D buffer containing input tensors. - """ - if len(tensors) == 1: - return tensors[0].contiguous().view(-1) - flat = torch.cat([t.contiguous().view(-1) for t in tensors], dim=0) - return flat - -def _unflatten_dense_tensors(flat, tensors): - """View a flat buffer using the sizes of tensors. Assume that tensors are of - same dense type, and that flat is given by _flatten_dense_tensors. - Arguments: - flat (Tensor): flattened dense tensors to unflatten. - tensors (Iterable[Tensor]): dense tensors whose sizes will be used to - unflatten flat. - Returns: - Unflattened dense tensors with sizes same as tensors and values from - flat. - """ - outputs = [] - offset = 0 - for tensor in tensors: - numel = tensor.numel() - outputs.append(flat.narrow(0, offset, numel).view_as(tensor)) - offset += numel - return tuple(outputs) - - -''' -This version of DistributedDataParallel is designed to be used in conjunction with the multiproc.py -launcher included with this example. It assumes that your run is using multiprocess with 1 -GPU/process, that the model is on the correct device, and that torch.set_device has been -used to set the device. - -Parameters are broadcasted to the other processes on initialization of DistributedDataParallel, -and will be allreduced at the finish of the backward pass. -''' -class DistributedDataParallel(Module): - - def __init__(self, module): - super(DistributedDataParallel, self).__init__() - #fallback for PyTorch 0.3 - if not hasattr(dist, '_backend'): - self.warn_on_half = True - else: - self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False - - self.module = module - - for p in self.module.state_dict().values(): - if not torch.is_tensor(p): - continue - dist.broadcast(p, 0) - - def allreduce_params(): - if(self.needs_reduction): - self.needs_reduction = False - buckets = {} - for param in self.module.parameters(): - if param.requires_grad and param.grad is not None: - tp = type(param.data) - if tp not in buckets: - buckets[tp] = [] - buckets[tp].append(param) - if self.warn_on_half: - if torch.cuda.HalfTensor in buckets: - print("WARNING: gloo dist backend for half parameters may be extremely slow." + - " It is recommended to use the NCCL backend in this case. 
This currently requires" + - "PyTorch built from top of tree master.") - self.warn_on_half = False - - for tp in buckets: - bucket = buckets[tp] - grads = [param.grad.data for param in bucket] - coalesced = _flatten_dense_tensors(grads) - dist.all_reduce(coalesced) - coalesced /= dist.get_world_size() - for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)): - buf.copy_(synced) - - for param in list(self.module.parameters()): - def allreduce_hook(*unused): - param._execution_engine.queue_callback(allreduce_params) - if param.requires_grad: - param.register_hook(allreduce_hook) - - def forward(self, *inputs, **kwargs): - self.needs_reduction = True - return self.module(*inputs, **kwargs) - - ''' - def _sync_buffers(self): - buffers = list(self.module._all_buffers()) - if len(buffers) > 0: - # cross-node buffer sync - flat_buffers = _flatten_dense_tensors(buffers) - dist.broadcast(flat_buffers, 0) - for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)): - buf.copy_(synced) - def train(self, mode=True): - # Clear NCCL communicator and CUDA event cache of the default group ID, - # These cache will be recreated at the later call. This is currently a - # work-around for a potential NCCL deadlock. - if dist._backend == dist.dist_backend.NCCL: - dist._clear_group_cache() - super(DistributedDataParallel, self).train(mode) - self.module.train(mode) - ''' -''' -Modifies existing model to do gradient allreduce, but doesn't change class -so you don't need "module" -''' -def apply_gradient_allreduce(module): - if not hasattr(dist, '_backend'): - module.warn_on_half = True - else: - module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False - - for p in module.state_dict().values(): - if not torch.is_tensor(p): - continue - dist.broadcast(p, 0) - - def allreduce_params(): - if(module.needs_reduction): - module.needs_reduction = False - buckets = {} - for param in module.parameters(): - if param.requires_grad and param.grad is not None: - tp = param.data.dtype - if tp not in buckets: - buckets[tp] = [] - buckets[tp].append(param) - if module.warn_on_half: - if torch.cuda.HalfTensor in buckets: - print("WARNING: gloo dist backend for half parameters may be extremely slow." + - " It is recommended to use the NCCL backend in this case. This currently requires" + - "PyTorch built from top of tree master.") - module.warn_on_half = False - - for tp in buckets: - bucket = buckets[tp] - grads = [param.grad.data for param in bucket] - coalesced = _flatten_dense_tensors(grads) - dist.all_reduce(coalesced) - coalesced /= dist.get_world_size() - for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)): - buf.copy_(synced) - - for param in list(module.parameters()): - def allreduce_hook(*unused): - Variable._execution_engine.queue_callback(allreduce_params) - if param.requires_grad: - param.register_hook(allreduce_hook) - - def set_needs_reduction(self, input, output): - self.needs_reduction = True - - module.register_forward_hook(set_needs_reduction) - return module diff --git a/glow.py b/glow.py deleted file mode 100644 index e5ce84a..0000000 --- a/glow.py +++ /dev/null @@ -1,311 +0,0 @@ -# ***************************************************************************** -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of the NVIDIA CORPORATION nor the -# names of its contributors may be used to endorse or promote products -# derived from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY -# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# -# ***************************************************************************** -import copy -import torch -from torch.autograd import Variable -import torch.nn.functional as F - - -@torch.jit.script -def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): - n_channels_int = n_channels[0] - in_act = input_a+input_b - t_act = torch.nn.functional.tanh(in_act[:, :n_channels_int, :]) - s_act = torch.nn.functional.sigmoid(in_act[:, n_channels_int:, :]) - acts = t_act * s_act - return acts - - -class WaveGlowLoss(torch.nn.Module): - def __init__(self, sigma=1.0): - super(WaveGlowLoss, self).__init__() - self.sigma = sigma - - def forward(self, model_output): - z, log_s_list, log_det_W_list = model_output - for i, log_s in enumerate(log_s_list): - if i == 0: - log_s_total = torch.sum(log_s) - log_det_W_total = log_det_W_list[i] - else: - log_s_total = log_s_total + torch.sum(log_s) - log_det_W_total += log_det_W_list[i] - - loss = torch.sum(z*z)/(2*self.sigma*self.sigma) - log_s_total - log_det_W_total - return loss/(z.size(0)*z.size(1)*z.size(2)) - - -class Invertible1x1Conv(torch.nn.Module): - """ - The layer outputs both the convolution, and the log determinant - of its weight matrix. 
If reverse=True it does convolution with - inverse - """ - def __init__(self, c): - super(Invertible1x1Conv, self).__init__() - self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0, - bias=False) - - # Sample a random orthonormal matrix to initialize weights - W = torch.qr(torch.FloatTensor(c, c).normal_())[0] - - # Ensure determinant is 1.0 not -1.0 - if torch.det(W) < 0: - W[:,0] = -1*W[:,0] - W = W.view(c, c, 1) - self.conv.weight.data = W - - def forward(self, z, reverse=False): - # shape - batch_size, group_size, n_of_groups = z.size() - - W = self.conv.weight.squeeze() - - if reverse: - if not hasattr(self, 'W_inverse'): - # Reverse computation - W_inverse = W.inverse() - W_inverse = Variable(W_inverse[..., None]) - if z.type() == 'torch.cuda.HalfTensor': - W_inverse = W_inverse.half() - self.W_inverse = W_inverse - z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) - return z - else: - # Forward computation - log_det_W = batch_size * n_of_groups * torch.logdet(W) - z = self.conv(z) - return z, log_det_W - - -class WN(torch.nn.Module): - """ - This is the WaveNet like layer for the affine coupling. The primary difference - from WaveNet is the convolutions need not be causal. There is also no dilation - size reset. The dilation only doubles on each layer - """ - def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels, - kernel_size): - super(WN, self).__init__() - assert(kernel_size % 2 == 1) - assert(n_channels % 2 == 0) - self.n_layers = n_layers - self.n_channels = n_channels - self.in_layers = torch.nn.ModuleList() - self.res_skip_layers = torch.nn.ModuleList() - self.cond_layers = torch.nn.ModuleList() - - start = torch.nn.Conv1d(n_in_channels, n_channels, 1) - start = torch.nn.utils.weight_norm(start, name='weight') - self.start = start - - # Initializing last layer to 0 makes the affine coupling layers - # do nothing at first. 
This helps with training stability - end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1) - end.weight.data.zero_() - end.bias.data.zero_() - self.end = end - - for i in range(n_layers): - dilation = 2 ** i - padding = int((kernel_size*dilation - dilation)/2) - in_layer = torch.nn.Conv1d(n_channels, 2*n_channels, kernel_size, - dilation=dilation, padding=padding) - in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') - self.in_layers.append(in_layer) - - cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels, 1) - cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') - self.cond_layers.append(cond_layer) - - # last one is not necessary - if i < n_layers - 1: - res_skip_channels = 2*n_channels - else: - res_skip_channels = n_channels - res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) - res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') - self.res_skip_layers.append(res_skip_layer) - - def forward(self, forward_input): - audio, spect = forward_input - audio = self.start(audio) - for i in range(self.n_layers): - acts = fused_add_tanh_sigmoid_multiply( - self.in_layers[i](audio), - self.cond_layers[i](spect), - torch.IntTensor([self.n_channels])) - - res_skip_acts = self.res_skip_layers[i](acts) - if i < self.n_layers - 1: - audio = res_skip_acts[:,:self.n_channels,:] + audio - skip_acts = res_skip_acts[:,self.n_channels:,:] - else: - skip_acts = res_skip_acts - - if i == 0: - output = skip_acts - else: - output = skip_acts + output - return self.end(output) - - -class WaveGlow(torch.nn.Module): - def __init__(self, n_mel_channels, n_flows, n_group, n_early_every, - n_early_size, WN_config): - super(WaveGlow, self).__init__() - - self.upsample = torch.nn.ConvTranspose1d(n_mel_channels, - n_mel_channels, - 1024, stride=256) - assert(n_group % 2 == 0) - self.n_flows = n_flows - self.n_group = n_group - self.n_early_every = n_early_every - self.n_early_size = n_early_size - self.WN = torch.nn.ModuleList() - self.convinv = torch.nn.ModuleList() - - n_half = int(n_group/2) - - # Set up layers with the right sizes based on how many dimensions - # have been output already - n_remaining_channels = n_group - for k in range(n_flows): - if k % self.n_early_every == 0 and k > 0: - n_half = n_half - int(self.n_early_size/2) - n_remaining_channels = n_remaining_channels - self.n_early_size - self.convinv.append(Invertible1x1Conv(n_remaining_channels)) - self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config)) - self.n_remaining_channels = n_remaining_channels # Useful during inference - - def forward(self, forward_input): - """ - forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames - forward_input[1] = audio: batch x time - """ - spect, audio = forward_input - - # Upsample spectrogram to size of audio - spect = self.upsample(spect) - assert(spect.size(2) >= audio.size(1)) - if spect.size(2) > audio.size(1): - spect = spect[:, :, :audio.size(1)] - - spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) - spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) - - audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1) - output_audio = [] - log_s_list = [] - log_det_W_list = [] - - for k in range(self.n_flows): - if k % self.n_early_every == 0 and k > 0: - output_audio.append(audio[:,:self.n_early_size,:]) - audio = audio[:,self.n_early_size:,:] - - audio, log_det_W = self.convinv[k](audio) - log_det_W_list.append(log_det_W) - - n_half = int(audio.size(1)/2) - 
audio_0 = audio[:,:n_half,:] - audio_1 = audio[:,n_half:,:] - - output = self.WN[k]((audio_0, spect)) - log_s = output[:, n_half:, :] - b = output[:, :n_half, :] - audio_1 = torch.exp(log_s)*audio_1 + b - log_s_list.append(log_s) - - audio = torch.cat([audio_0, audio_1],1) - - output_audio.append(audio) - return torch.cat(output_audio,1), log_s_list, log_det_W_list - - def infer(self, spect, sigma=1.0): - spect = self.upsample(spect) - # trim conv artifacts. maybe pad spec to kernel multiple - time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] - spect = spect[:, :, :-time_cutoff] - - spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) - spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) - - if spect.type() == 'torch.cuda.HalfTensor': - audio = torch.cuda.HalfTensor(spect.size(0), - self.n_remaining_channels, - spect.size(2)).normal_() - else: - # cuda.FloatTensor -> FloatTensor - audio = torch.FloatTensor(spect.size(0), - self.n_remaining_channels, - spect.size(2)).normal_() - - audio = torch.autograd.Variable(sigma*audio) - - for k in reversed(range(self.n_flows)): - n_half = int(audio.size(1)/2) - audio_0 = audio[:,:n_half,:] - audio_1 = audio[:,n_half:,:] - - output = self.WN[k]((audio_0, spect)) - s = output[:, n_half:, :] - b = output[:, :n_half, :] - audio_1 = (audio_1 - b)/torch.exp(s) - audio = torch.cat([audio_0, audio_1],1) - - audio = self.convinv[k](audio, reverse=True) - - if k % self.n_early_every == 0 and k > 0: - if spect.type() == 'torch.cuda.HalfTensor': - z = torch.cuda.HalfTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_() - else: - # cuda.FloatTensor -> FloatTensor - z = torch.FloatTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_() - audio = torch.cat((sigma*z, audio),1) - - audio = audio.permute(0,2,1).contiguous().view(audio.size(0), -1).data - return audio - - @staticmethod - def remove_weightnorm(model): - waveglow = model - for WN in waveglow.WN: - WN.start = torch.nn.utils.remove_weight_norm(WN.start) - WN.in_layers = remove(WN.in_layers) - WN.cond_layers = remove(WN.cond_layers) - WN.res_skip_layers = remove(WN.res_skip_layers) - return waveglow - - -def remove(conv_list): - new_conv_list = torch.nn.ModuleList() - for old_conv in conv_list: - old_conv = torch.nn.utils.remove_weight_norm(old_conv) - new_conv_list.append(old_conv) - return new_conv_list diff --git a/hparams.py b/hparams.py deleted file mode 100644 index 9704bbd..0000000 --- a/hparams.py +++ /dev/null @@ -1,96 +0,0 @@ -import tensorflow as tf -from text import symbols - - -# changed path, sampling rate and batch size -def create_hparams(hparams_string=None, verbose=False): - """Create model hyperparameters. 
Parse nondefault from given string.""" - - hparams = tf.contrib.training.HParams( - ################################ - # Experiment Parameters # - ################################ - epochs=500, - iters_per_checkpoint=1000, - seed=1234, - dynamic_loss_scaling=True, - fp16_run=False, - distributed_run=False, - dist_backend="nccl", - dist_url="tcp://localhost:54321", - cudnn_enabled=True, - cudnn_benchmark=False, - ignore_layers=['embedding.weight'], - - ################################ - # Data Parameters # - ################################ - load_mel_from_disk=False, - training_files='lists/tts_data_train_processed.txt', - validation_files='filelists/tts_data_val_processed.txt', - text_cleaners=['english_cleaners'], - - ################################ - # Audio Parameters # - ################################ - max_wav_value=32768.0, - sampling_rate=16000, - filter_length=1024, - hop_length=256, - win_length=1024, - n_mel_channels=80, - mel_fmin=0.0, - mel_fmax=8000.0, - - ################################ - # Model Parameters # - ################################ - n_symbols=len(symbols), - symbols_embedding_dim=512, - - # Encoder parameters - encoder_kernel_size=5, - encoder_n_convolutions=3, - encoder_embedding_dim=512, - - # Decoder parameters - n_frames_per_step=1, # currently only 1 is supported - decoder_rnn_dim=1024, - prenet_dim=256, - max_decoder_steps=1000, - gate_threshold=0.5, - p_attention_dropout=0.1, - p_decoder_dropout=0.1, - - # Attention parameters - attention_rnn_dim=1024, - attention_dim=128, - - # Location Layer parameters - attention_location_n_filters=32, - attention_location_kernel_size=31, - - # Mel-post processing network parameters - postnet_embedding_dim=512, - postnet_kernel_size=5, - postnet_n_convolutions=5, - - ################################ - # Optimization Hyperparameters # - ################################ - use_saved_learning_rate=False, - learning_rate=1e-3, - weight_decay=1e-6, - grad_clip_thresh=1.0, - batch_size=4, - mask_padding=True # set model's padded outputs to padded values - ) - - if hparams_string: - tf.logging.info('Parsing command line hparams: %s', hparams_string) - hparams.parse(hparams_string) - - if verbose: - tf.logging.info('Final parsed hparams: %s', hparams.values()) - - return hparams diff --git a/layers.py b/layers.py deleted file mode 100644 index 615a64a..0000000 --- a/layers.py +++ /dev/null @@ -1,80 +0,0 @@ -import torch -from librosa.filters import mel as librosa_mel_fn -from audio_processing import dynamic_range_compression -from audio_processing import dynamic_range_decompression -from stft import STFT - - -class LinearNorm(torch.nn.Module): - def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'): - super(LinearNorm, self).__init__() - self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias) - - torch.nn.init.xavier_uniform_( - self.linear_layer.weight, - gain=torch.nn.init.calculate_gain(w_init_gain)) - - def forward(self, x): - return self.linear_layer(x) - - -class ConvNorm(torch.nn.Module): - def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, - padding=None, dilation=1, bias=True, w_init_gain='linear'): - super(ConvNorm, self).__init__() - if padding is None: - assert(kernel_size % 2 == 1) - padding = int(dilation * (kernel_size - 1) / 2) - - self.conv = torch.nn.Conv1d(in_channels, out_channels, - kernel_size=kernel_size, stride=stride, - padding=padding, dilation=dilation, - bias=bias) - - torch.nn.init.xavier_uniform_( - self.conv.weight, 
gain=torch.nn.init.calculate_gain(w_init_gain)) - - def forward(self, signal): - conv_signal = self.conv(signal) - return conv_signal - - -class TacotronSTFT(torch.nn.Module): - def __init__(self, filter_length=1024, hop_length=256, win_length=1024, - n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0, - mel_fmax=8000.0): - super(TacotronSTFT, self).__init__() - self.n_mel_channels = n_mel_channels - self.sampling_rate = sampling_rate - self.stft_fn = STFT(filter_length, hop_length, win_length) - mel_basis = librosa_mel_fn( - sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax) - mel_basis = torch.from_numpy(mel_basis).float() - self.register_buffer('mel_basis', mel_basis) - - def spectral_normalize(self, magnitudes): - output = dynamic_range_compression(magnitudes) - return output - - def spectral_de_normalize(self, magnitudes): - output = dynamic_range_decompression(magnitudes) - return output - - def mel_spectrogram(self, y): - """Computes mel-spectrograms from a batch of waves - PARAMS - ------ - y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1] - - RETURNS - ------- - mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) - """ - assert(torch.min(y.data) >= -1) - assert(torch.max(y.data) <= 1) - - magnitudes, phases = self.stft_fn.transform(y) - magnitudes = magnitudes.data - mel_output = torch.matmul(self.mel_basis, magnitudes) - mel_output = self.spectral_normalize(mel_output) - return mel_output diff --git a/logger.py b/logger.py deleted file mode 100644 index 9b999ad..0000000 --- a/logger.py +++ /dev/null @@ -1,48 +0,0 @@ -import random -import torch -from tensorboardX import SummaryWriter -from plotting_utils import plot_alignment_to_numpy, plot_spectrogram_to_numpy -from plotting_utils import plot_gate_outputs_to_numpy - - -class Tacotron2Logger(SummaryWriter): - def __init__(self, logdir): - super(Tacotron2Logger, self).__init__(logdir) - - def log_training(self, reduced_loss, grad_norm, learning_rate, duration, - iteration): - self.add_scalar("training.loss", reduced_loss, iteration) - self.add_scalar("grad.norm", grad_norm, iteration) - self.add_scalar("learning.rate", learning_rate, iteration) - self.add_scalar("duration", duration, iteration) - - def log_validation(self, reduced_loss, model, y, y_pred, iteration): - self.add_scalar("validation.loss", reduced_loss, iteration) - _, mel_outputs, gate_outputs, alignments = y_pred - mel_targets, gate_targets = y - - # plot distribution of parameters - for tag, value in model.named_parameters(): - tag = tag.replace('.', '/') - self.add_histogram(tag, value.data.cpu().numpy(), iteration) - - # plot alignment, mel target and predicted, gate target and predicted - idx = random.randint(0, alignments.size(0) - 1) - self.add_image( - "alignment", - plot_alignment_to_numpy(alignments[idx].data.cpu().numpy().T), - iteration) - self.add_image( - "mel_target", - plot_spectrogram_to_numpy(mel_targets[idx].data.cpu().numpy()), - iteration) - self.add_image( - "mel_predicted", - plot_spectrogram_to_numpy(mel_outputs[idx].data.cpu().numpy()), - iteration) - self.add_image( - "gate", - plot_gate_outputs_to_numpy( - gate_targets[idx].data.cpu().numpy(), - torch.sigmoid(gate_outputs[idx]).data.cpu().numpy()), - iteration) diff --git a/loss_function.py b/loss_function.py deleted file mode 100644 index 99cae95..0000000 --- a/loss_function.py +++ /dev/null @@ -1,19 +0,0 @@ -from torch import nn - - -class Tacotron2Loss(nn.Module): - def __init__(self): - super(Tacotron2Loss, self).__init__() - - def 
forward(self, model_output, targets): - mel_target, gate_target = targets[0], targets[1] - mel_target.requires_grad = False - gate_target.requires_grad = False - gate_target = gate_target.view(-1, 1) - - mel_out, mel_out_postnet, gate_out, _ = model_output - gate_out = gate_out.view(-1, 1) - mel_loss = nn.MSELoss()(mel_out, mel_target) + \ - nn.MSELoss()(mel_out_postnet, mel_target) - gate_loss = nn.BCEWithLogitsLoss()(gate_out, gate_target) - return mel_loss + gate_loss diff --git a/model.py b/model.py deleted file mode 100644 index 4c7d7d2..0000000 --- a/model.py +++ /dev/null @@ -1,529 +0,0 @@ -from math import sqrt -import torch -from torch.autograd import Variable -from torch import nn -from torch.nn import functional as F -from layers import ConvNorm, LinearNorm -from utils import to_gpu, get_mask_from_lengths - - -class LocationLayer(nn.Module): - def __init__(self, attention_n_filters, attention_kernel_size, - attention_dim): - super(LocationLayer, self).__init__() - padding = int((attention_kernel_size - 1) / 2) - self.location_conv = ConvNorm(2, attention_n_filters, - kernel_size=attention_kernel_size, - padding=padding, bias=False, stride=1, - dilation=1) - self.location_dense = LinearNorm(attention_n_filters, attention_dim, - bias=False, w_init_gain='tanh') - - def forward(self, attention_weights_cat): - processed_attention = self.location_conv(attention_weights_cat) - processed_attention = processed_attention.transpose(1, 2) - processed_attention = self.location_dense(processed_attention) - return processed_attention - - -class Attention(nn.Module): - def __init__(self, attention_rnn_dim, embedding_dim, attention_dim, - attention_location_n_filters, attention_location_kernel_size): - super(Attention, self).__init__() - self.query_layer = LinearNorm(attention_rnn_dim, attention_dim, - bias=False, w_init_gain='tanh') - self.memory_layer = LinearNorm(embedding_dim, attention_dim, bias=False, - w_init_gain='tanh') - self.v = LinearNorm(attention_dim, 1, bias=False) - self.location_layer = LocationLayer(attention_location_n_filters, - attention_location_kernel_size, - attention_dim) - self.score_mask_value = -float("inf") - - def get_alignment_energies(self, query, processed_memory, - attention_weights_cat): - """ - PARAMS - ------ - query: decoder output (batch, n_mel_channels * n_frames_per_step) - processed_memory: processed encoder outputs (B, T_in, attention_dim) - attention_weights_cat: cumulative and prev. 
att weights (B, 2, max_time) - - RETURNS - ------- - alignment (batch, max_time) - """ - - processed_query = self.query_layer(query.unsqueeze(1)) - processed_attention_weights = self.location_layer(attention_weights_cat) - energies = self.v(torch.tanh( - processed_query + processed_attention_weights + processed_memory)) - - energies = energies.squeeze(-1) - return energies - - def forward(self, attention_hidden_state, memory, processed_memory, - attention_weights_cat, mask): - """ - PARAMS - ------ - attention_hidden_state: attention rnn last output - memory: encoder outputs - processed_memory: processed encoder outputs - attention_weights_cat: previous and cummulative attention weights - mask: binary mask for padded data - """ - alignment = self.get_alignment_energies( - attention_hidden_state, processed_memory, attention_weights_cat) - - if mask is not None: - alignment.data.masked_fill_(mask, self.score_mask_value) - - attention_weights = F.softmax(alignment, dim=1) - attention_context = torch.bmm(attention_weights.unsqueeze(1), memory) - attention_context = attention_context.squeeze(1) - - return attention_context, attention_weights - - -class Prenet(nn.Module): - def __init__(self, in_dim, sizes): - super(Prenet, self).__init__() - in_sizes = [in_dim] + sizes[:-1] - self.layers = nn.ModuleList( - [LinearNorm(in_size, out_size, bias=False) - for (in_size, out_size) in zip(in_sizes, sizes)]) - - def forward(self, x): - for linear in self.layers: - x = F.dropout(F.relu(linear(x)), p=0.5, training=True) - return x - - -class Postnet(nn.Module): - """Postnet - - Five 1-d convolution with 512 channels and kernel size 5 - """ - - def __init__(self, hparams): - super(Postnet, self).__init__() - self.convolutions = nn.ModuleList() - - self.convolutions.append( - nn.Sequential( - ConvNorm(hparams.n_mel_channels, hparams.postnet_embedding_dim, - kernel_size=hparams.postnet_kernel_size, stride=1, - padding=int((hparams.postnet_kernel_size - 1) / 2), - dilation=1, w_init_gain='tanh'), - nn.BatchNorm1d(hparams.postnet_embedding_dim)) - ) - - for i in range(1, hparams.postnet_n_convolutions - 1): - self.convolutions.append( - nn.Sequential( - ConvNorm(hparams.postnet_embedding_dim, - hparams.postnet_embedding_dim, - kernel_size=hparams.postnet_kernel_size, stride=1, - padding=int((hparams.postnet_kernel_size - 1) / 2), - dilation=1, w_init_gain='tanh'), - nn.BatchNorm1d(hparams.postnet_embedding_dim)) - ) - - self.convolutions.append( - nn.Sequential( - ConvNorm(hparams.postnet_embedding_dim, hparams.n_mel_channels, - kernel_size=hparams.postnet_kernel_size, stride=1, - padding=int((hparams.postnet_kernel_size - 1) / 2), - dilation=1, w_init_gain='linear'), - nn.BatchNorm1d(hparams.n_mel_channels)) - ) - - def forward(self, x): - for i in range(len(self.convolutions) - 1): - x = F.dropout(torch.tanh(self.convolutions[i](x)), 0.5, self.training) - x = F.dropout(self.convolutions[-1](x), 0.5, self.training) - - return x - - -class Encoder(nn.Module): - """Encoder module: - - Three 1-d convolution banks - - Bidirectional LSTM - """ - def __init__(self, hparams): - super(Encoder, self).__init__() - - convolutions = [] - for _ in range(hparams.encoder_n_convolutions): - conv_layer = nn.Sequential( - ConvNorm(hparams.encoder_embedding_dim, - hparams.encoder_embedding_dim, - kernel_size=hparams.encoder_kernel_size, stride=1, - padding=int((hparams.encoder_kernel_size - 1) / 2), - dilation=1, w_init_gain='relu'), - nn.BatchNorm1d(hparams.encoder_embedding_dim)) - convolutions.append(conv_layer) - 
self.convolutions = nn.ModuleList(convolutions) - - self.lstm = nn.LSTM(hparams.encoder_embedding_dim, - int(hparams.encoder_embedding_dim / 2), 1, - batch_first=True, bidirectional=True) - - def forward(self, x, input_lengths): - for conv in self.convolutions: - x = F.dropout(F.relu(conv(x)), 0.5, self.training) - - x = x.transpose(1, 2) - - # pytorch tensor are not reversible, hence the conversion - input_lengths = input_lengths.cpu().numpy() - x = nn.utils.rnn.pack_padded_sequence( - x, input_lengths, batch_first=True) - - self.lstm.flatten_parameters() - outputs, _ = self.lstm(x) - - outputs, _ = nn.utils.rnn.pad_packed_sequence( - outputs, batch_first=True) - - return outputs - - def inference(self, x): - for conv in self.convolutions: - x = F.dropout(F.relu(conv(x)), 0.5, self.training) - - x = x.transpose(1, 2) - - self.lstm.flatten_parameters() - outputs, _ = self.lstm(x) - - return outputs - - -class Decoder(nn.Module): - def __init__(self, hparams): - super(Decoder, self).__init__() - self.n_mel_channels = hparams.n_mel_channels - self.n_frames_per_step = hparams.n_frames_per_step - self.encoder_embedding_dim = hparams.encoder_embedding_dim - self.attention_rnn_dim = hparams.attention_rnn_dim - self.decoder_rnn_dim = hparams.decoder_rnn_dim - self.prenet_dim = hparams.prenet_dim - self.max_decoder_steps = hparams.max_decoder_steps - self.gate_threshold = hparams.gate_threshold - self.p_attention_dropout = hparams.p_attention_dropout - self.p_decoder_dropout = hparams.p_decoder_dropout - - self.prenet = Prenet( - hparams.n_mel_channels * hparams.n_frames_per_step, - [hparams.prenet_dim, hparams.prenet_dim]) - - self.attention_rnn = nn.LSTMCell( - hparams.prenet_dim + hparams.encoder_embedding_dim, - hparams.attention_rnn_dim) - - self.attention_layer = Attention( - hparams.attention_rnn_dim, hparams.encoder_embedding_dim, - hparams.attention_dim, hparams.attention_location_n_filters, - hparams.attention_location_kernel_size) - - self.decoder_rnn = nn.LSTMCell( - hparams.attention_rnn_dim + hparams.encoder_embedding_dim, - hparams.decoder_rnn_dim, 1) - - self.linear_projection = LinearNorm( - hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, - hparams.n_mel_channels * hparams.n_frames_per_step) - - self.gate_layer = LinearNorm( - hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, 1, - bias=True, w_init_gain='sigmoid') - - def get_go_frame(self, memory): - """ Gets all zeros frames to use as first decoder input - PARAMS - ------ - memory: decoder outputs - - RETURNS - ------- - decoder_input: all zeros frames - """ - B = memory.size(0) - decoder_input = Variable(memory.data.new( - B, self.n_mel_channels * self.n_frames_per_step).zero_()) - return decoder_input - - def initialize_decoder_states(self, memory, mask): - """ Initializes attention rnn states, decoder rnn states, attention - weights, attention cumulative weights, attention context, stores memory - and stores processed memory - PARAMS - ------ - memory: Encoder outputs - mask: Mask for padded data if training, expects None for inference - """ - B = memory.size(0) - MAX_TIME = memory.size(1) - - self.attention_hidden = Variable(memory.data.new( - B, self.attention_rnn_dim).zero_()) - self.attention_cell = Variable(memory.data.new( - B, self.attention_rnn_dim).zero_()) - - self.decoder_hidden = Variable(memory.data.new( - B, self.decoder_rnn_dim).zero_()) - self.decoder_cell = Variable(memory.data.new( - B, self.decoder_rnn_dim).zero_()) - - self.attention_weights = Variable(memory.data.new( - B, 
MAX_TIME).zero_()) - self.attention_weights_cum = Variable(memory.data.new( - B, MAX_TIME).zero_()) - self.attention_context = Variable(memory.data.new( - B, self.encoder_embedding_dim).zero_()) - - self.memory = memory - self.processed_memory = self.attention_layer.memory_layer(memory) - self.mask = mask - - def parse_decoder_inputs(self, decoder_inputs): - """ Prepares decoder inputs, i.e. mel outputs - PARAMS - ------ - decoder_inputs: inputs used for teacher-forced training, i.e. mel-specs - - RETURNS - ------- - inputs: processed decoder inputs - - """ - # (B, n_mel_channels, T_out) -> (B, T_out, n_mel_channels) - decoder_inputs = decoder_inputs.transpose(1, 2) - decoder_inputs = decoder_inputs.view( - decoder_inputs.size(0), - int(decoder_inputs.size(1)/self.n_frames_per_step), -1) - # (B, T_out, n_mel_channels) -> (T_out, B, n_mel_channels) - decoder_inputs = decoder_inputs.transpose(0, 1) - return decoder_inputs - - def parse_decoder_outputs(self, mel_outputs, gate_outputs, alignments): - """ Prepares decoder outputs for output - PARAMS - ------ - mel_outputs: - gate_outputs: gate output energies - alignments: - - RETURNS - ------- - mel_outputs: - gate_outpust: gate output energies - alignments: - """ - # (T_out, B) -> (B, T_out) - alignments = torch.stack(alignments).transpose(0, 1) - # (T_out, B) -> (B, T_out) - gate_outputs = torch.stack(gate_outputs).transpose(0, 1) - gate_outputs = gate_outputs.contiguous() - # (T_out, B, n_mel_channels) -> (B, T_out, n_mel_channels) - mel_outputs = torch.stack(mel_outputs).transpose(0, 1).contiguous() - # decouple frames per step - mel_outputs = mel_outputs.view( - mel_outputs.size(0), -1, self.n_mel_channels) - # (B, T_out, n_mel_channels) -> (B, n_mel_channels, T_out) - mel_outputs = mel_outputs.transpose(1, 2) - - return mel_outputs, gate_outputs, alignments - - def decode(self, decoder_input): - """ Decoder step using stored states, attention and memory - PARAMS - ------ - decoder_input: previous mel output - - RETURNS - ------- - mel_output: - gate_output: gate output energies - attention_weights: - """ - cell_input = torch.cat((decoder_input, self.attention_context), -1) - self.attention_hidden, self.attention_cell = self.attention_rnn( - cell_input, (self.attention_hidden, self.attention_cell)) - self.attention_hidden = F.dropout( - self.attention_hidden, self.p_attention_dropout, self.training) - - attention_weights_cat = torch.cat( - (self.attention_weights.unsqueeze(1), - self.attention_weights_cum.unsqueeze(1)), dim=1) - self.attention_context, self.attention_weights = self.attention_layer( - self.attention_hidden, self.memory, self.processed_memory, - attention_weights_cat, self.mask) - - self.attention_weights_cum += self.attention_weights - decoder_input = torch.cat( - (self.attention_hidden, self.attention_context), -1) - self.decoder_hidden, self.decoder_cell = self.decoder_rnn( - decoder_input, (self.decoder_hidden, self.decoder_cell)) - self.decoder_hidden = F.dropout( - self.decoder_hidden, self.p_decoder_dropout, self.training) - - decoder_hidden_attention_context = torch.cat( - (self.decoder_hidden, self.attention_context), dim=1) - decoder_output = self.linear_projection( - decoder_hidden_attention_context) - - gate_prediction = self.gate_layer(decoder_hidden_attention_context) - return decoder_output, gate_prediction, self.attention_weights - - def forward(self, memory, decoder_inputs, memory_lengths): - """ Decoder forward pass for training - PARAMS - ------ - memory: Encoder outputs - decoder_inputs: Decoder inputs 
for teacher forcing. i.e. mel-specs - memory_lengths: Encoder output lengths for attention masking. - - RETURNS - ------- - mel_outputs: mel outputs from the decoder - gate_outputs: gate outputs from the decoder - alignments: sequence of attention weights from the decoder - """ - - decoder_input = self.get_go_frame(memory).unsqueeze(0) - decoder_inputs = self.parse_decoder_inputs(decoder_inputs) - decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0) - decoder_inputs = self.prenet(decoder_inputs) - - self.initialize_decoder_states( - memory, mask=~get_mask_from_lengths(memory_lengths)) - - mel_outputs, gate_outputs, alignments = [], [], [] - while len(mel_outputs) < decoder_inputs.size(0) - 1: - decoder_input = decoder_inputs[len(mel_outputs)] - mel_output, gate_output, attention_weights = self.decode( - decoder_input) - mel_outputs += [mel_output.squeeze(1)] - gate_outputs += [gate_output.squeeze()] - alignments += [attention_weights] - - mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs( - mel_outputs, gate_outputs, alignments) - - return mel_outputs, gate_outputs, alignments - - def inference(self, memory): - """ Decoder inference - PARAMS - ------ - memory: Encoder outputs - - RETURNS - ------- - mel_outputs: mel outputs from the decoder - gate_outputs: gate outputs from the decoder - alignments: sequence of attention weights from the decoder - """ - decoder_input = self.get_go_frame(memory) - - self.initialize_decoder_states(memory, mask=None) - - mel_outputs, gate_outputs, alignments = [], [], [] - while True: - decoder_input = self.prenet(decoder_input) - mel_output, gate_output, alignment = self.decode(decoder_input) - - mel_outputs += [mel_output.squeeze(1)] - gate_outputs += [gate_output] - alignments += [alignment] - - if torch.sigmoid(gate_output.data) > self.gate_threshold: - break - elif len(mel_outputs) == self.max_decoder_steps: - print("Warning! 
Reached max decoder steps") - break - - decoder_input = mel_output - - mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs( - mel_outputs, gate_outputs, alignments) - - return mel_outputs, gate_outputs, alignments - - -class Tacotron2(nn.Module): - def __init__(self, hparams): - super(Tacotron2, self).__init__() - self.mask_padding = hparams.mask_padding - self.fp16_run = hparams.fp16_run - self.n_mel_channels = hparams.n_mel_channels - self.n_frames_per_step = hparams.n_frames_per_step - self.embedding = nn.Embedding( - hparams.n_symbols, hparams.symbols_embedding_dim) - std = sqrt(2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim)) - val = sqrt(3.0) * std # uniform bounds for std - self.embedding.weight.data.uniform_(-val, val) - self.encoder = Encoder(hparams) - self.decoder = Decoder(hparams) - self.postnet = Postnet(hparams) - - def parse_batch(self, batch): - text_padded, input_lengths, mel_padded, gate_padded, \ - output_lengths = batch - text_padded = to_gpu(text_padded).long() - input_lengths = to_gpu(input_lengths).long() - max_len = torch.max(input_lengths.data).item() - mel_padded = to_gpu(mel_padded).float() - gate_padded = to_gpu(gate_padded).float() - output_lengths = to_gpu(output_lengths).long() - - return ( - (text_padded, input_lengths, mel_padded, max_len, output_lengths), - (mel_padded, gate_padded)) - - def parse_output(self, outputs, output_lengths=None): - if self.mask_padding and output_lengths is not None: - mask = ~get_mask_from_lengths(output_lengths) - mask = mask.expand(self.n_mel_channels, mask.size(0), mask.size(1)) - mask = mask.permute(1, 0, 2) - - outputs[0].data.masked_fill_(mask, 0.0) - outputs[1].data.masked_fill_(mask, 0.0) - outputs[2].data.masked_fill_(mask[:, 0, :], 1e3) # gate energies - - return outputs - - def forward(self, inputs): - text_inputs, text_lengths, mels, max_len, output_lengths = inputs - text_lengths, output_lengths = text_lengths.data, output_lengths.data - - embedded_inputs = self.embedding(text_inputs).transpose(1, 2) - - encoder_outputs = self.encoder(embedded_inputs, text_lengths) - - mel_outputs, gate_outputs, alignments = self.decoder( - encoder_outputs, mels, memory_lengths=text_lengths) - - mel_outputs_postnet = self.postnet(mel_outputs) - mel_outputs_postnet = mel_outputs + mel_outputs_postnet - - return self.parse_output( - [mel_outputs, mel_outputs_postnet, gate_outputs, alignments], - output_lengths) - - def inference(self, inputs): - embedded_inputs = self.embedding(inputs).transpose(1, 2) - encoder_outputs = self.encoder.inference(embedded_inputs) - mel_outputs, gate_outputs, alignments = self.decoder.inference( - encoder_outputs) - - mel_outputs_postnet = self.postnet(mel_outputs) - mel_outputs_postnet = mel_outputs + mel_outputs_postnet - - outputs = self.parse_output( - [mel_outputs, mel_outputs_postnet, gate_outputs, alignments]) - - return outputs diff --git a/plotting_utils.py b/plotting_utils.py deleted file mode 100644 index ca7e168..0000000 --- a/plotting_utils.py +++ /dev/null @@ -1,61 +0,0 @@ -import matplotlib -matplotlib.use("Agg") -import matplotlib.pylab as plt -import numpy as np - - -def save_figure_to_numpy(fig): - # save it to a numpy array. 
- data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') - data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) - return data - - -def plot_alignment_to_numpy(alignment, info=None): - fig, ax = plt.subplots(figsize=(6, 4)) - im = ax.imshow(alignment, aspect='auto', origin='lower', - interpolation='none') - fig.colorbar(im, ax=ax) - xlabel = 'Decoder timestep' - if info is not None: - xlabel += '\n\n' + info - plt.xlabel(xlabel) - plt.ylabel('Encoder timestep') - plt.tight_layout() - - fig.canvas.draw() - data = save_figure_to_numpy(fig) - plt.close() - return data - - -def plot_spectrogram_to_numpy(spectrogram): - fig, ax = plt.subplots(figsize=(12, 3)) - im = ax.imshow(spectrogram, aspect="auto", origin="lower", - interpolation='none') - plt.colorbar(im, ax=ax) - plt.xlabel("Frames") - plt.ylabel("Channels") - plt.tight_layout() - - fig.canvas.draw() - data = save_figure_to_numpy(fig) - plt.close() - return data - - -def plot_gate_outputs_to_numpy(gate_targets, gate_outputs): - fig, ax = plt.subplots(figsize=(12, 3)) - ax.scatter(range(len(gate_targets)), gate_targets, alpha=0.5, - color='green', marker='+', s=1, label='target') - ax.scatter(range(len(gate_outputs)), gate_outputs, alpha=0.5, - color='red', marker='.', s=1, label='predicted') - - plt.xlabel("Frames (Green target, Red predicted)") - plt.ylabel("Gate State") - plt.tight_layout() - - fig.canvas.draw() - data = save_figure_to_numpy(fig) - plt.close() - return data diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 539bb69..0000000 --- a/requirements.txt +++ /dev/null @@ -1,10 +0,0 @@ -tensorflow -numpy -inflect==0.2.5 -librosa==0.6.0 -scipy -tensorboardX -Unidecode==1.0.22 -pillow -torch==1.1.0 -pysoundfile diff --git a/requirements_dev.txt b/requirements_dev.txt deleted file mode 100644 index c5473c1..0000000 --- a/requirements_dev.txt +++ /dev/null @@ -1,15 +0,0 @@ -pip==18.1 -bumpversion==0.5.3 -wheel==0.32.1 -watchdog==0.9.0 -flake8==3.5.0 -tox==3.5.2 -coverage==4.5.1 -Sphinx==1.8.1 -twine==1.12.1 - -pytest==3.8.2 -pytest-runner==4.2 -pre-commit==1.16.1 -python-language-server[all] -ipdb diff --git a/server.py b/server.py deleted file mode 100644 index 7611e04..0000000 --- a/server.py +++ /dev/null @@ -1,39 +0,0 @@ -# -*- coding: utf-8 -*- -import grpc -import time -from sia.proto import tts_pb2 -from sia.proto import tts_pb2_grpc -from concurrent import futures -from sia.instruments import do_time -from tts import TTSModel - - -class TTSServer(): - def __init__(self): - self.tts_model = TTSModel() - - def TextToSpeechAPI(self, request, context): - while (True): - input_text = request.text - speech_response = self.tts_model.synth_speech(input_text) - return tts_pb2.SpeechResponse(response=speech_response) - - -def main(): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) - tts_server = TTSServer() - tts_pb2_grpc.add_ServerServicer_to_server(tts_server, server) - server.add_insecure_port('localhost:50060') - server.start() - print('TTSServer started!') - - try: - while True: - time.sleep(10000) - except KeyboardInterrupt: - server.start() - # server.stop(0) - - -if __name__ == "__main__": - main() diff --git a/stft.py b/stft.py deleted file mode 100644 index 70de3b7..0000000 --- a/stft.py +++ /dev/null @@ -1,141 +0,0 @@ -""" -BSD 3-Clause License - -Copyright (c) 2017, Prem Seetharaman -All rights reserved. 
- -* Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - -* Redistributions in binary form must reproduce the above copyright notice, this - list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - -* Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from this - software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -""" - -import torch -import numpy as np -import torch.nn.functional as F -from torch.autograd import Variable -from scipy.signal import get_window -from librosa.util import pad_center, tiny -from audio_processing import window_sumsquare - - -class STFT(torch.nn.Module): - """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft""" - def __init__(self, filter_length=800, hop_length=200, win_length=800, - window='hann'): - super(STFT, self).__init__() - self.filter_length = filter_length - self.hop_length = hop_length - self.win_length = win_length - self.window = window - self.forward_transform = None - scale = self.filter_length / self.hop_length - fourier_basis = np.fft.fft(np.eye(self.filter_length)) - - cutoff = int((self.filter_length / 2 + 1)) - fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), - np.imag(fourier_basis[:cutoff, :])]) - - forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) - inverse_basis = torch.FloatTensor( - np.linalg.pinv(scale * fourier_basis).T[:, None, :]) - - if window is not None: - assert(filter_length >= win_length) - # get window and zero center pad it to filter_length - fft_window = get_window(window, win_length, fftbins=True) - fft_window = pad_center(fft_window, filter_length) - fft_window = torch.from_numpy(fft_window).float() - - # window the bases - forward_basis *= fft_window - inverse_basis *= fft_window - - self.register_buffer('forward_basis', forward_basis.float()) - self.register_buffer('inverse_basis', inverse_basis.float()) - - def transform(self, input_data): - num_batches = input_data.size(0) - num_samples = input_data.size(1) - - self.num_samples = num_samples - - # similar to librosa, reflect-pad the input - input_data = input_data.view(num_batches, 1, num_samples) - input_data = F.pad( - input_data.unsqueeze(1), - (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), - mode='reflect') - input_data = input_data.squeeze(1) - - forward_transform = F.conv1d( - input_data, - Variable(self.forward_basis, requires_grad=False), - stride=self.hop_length, - padding=0) 
- - cutoff = int((self.filter_length / 2) + 1) - real_part = forward_transform[:, :cutoff, :] - imag_part = forward_transform[:, cutoff:, :] - - magnitude = torch.sqrt(real_part**2 + imag_part**2) - phase = torch.autograd.Variable( - torch.atan2(imag_part.data, real_part.data)) - - return magnitude, phase - - def inverse(self, magnitude, phase): - recombine_magnitude_phase = torch.cat( - [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1) - - inverse_transform = F.conv_transpose1d( - recombine_magnitude_phase, - Variable(self.inverse_basis, requires_grad=False), - stride=self.hop_length, - padding=0) - - if self.window is not None: - window_sum = window_sumsquare( - self.window, magnitude.size(-1), hop_length=self.hop_length, - win_length=self.win_length, n_fft=self.filter_length, - dtype=np.float32) - # remove modulation effects - approx_nonzero_indices = torch.from_numpy( - np.where(window_sum > tiny(window_sum))[0]) - window_sum = torch.autograd.Variable( - torch.from_numpy(window_sum), requires_grad=False) - #window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum #initially not commented out - inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices] - - # scale by hop ratio - inverse_transform *= float(self.filter_length) / self.hop_length - - inverse_transform = inverse_transform[:, :, int(self.filter_length/2):] - inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2):] - - return inverse_transform - - def forward(self, input_data): - self.magnitude, self.phase = self.transform(input_data) - reconstruction = self.inverse(self.magnitude, self.phase) - return reconstruction diff --git a/text/__init__.py b/text/__init__.py index 02ecf0e..a2dc0a6 100644 --- a/text/__init__.py +++ b/text/__init__.py @@ -1,22 +1,23 @@ +# -*- coding: utf-8 -*- """ from https://github.com/keithito/tacotron """ import re -from text import cleaners -from text.symbols import symbols - +from . import cleaners +from .symbols import symbols # Mappings from symbol to numeric ID and vice versa: _symbol_to_id = {s: i for i, s in enumerate(symbols)} _id_to_symbol = {i: s for i, s in enumerate(symbols)} # Regular expression matching text enclosed in curly braces: -_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') +_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)") def text_to_sequence(text, cleaner_names): - '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. + """Converts a string of text to a sequence of IDs corresponding to the + symbols in the text. - The text can optionally have ARPAbet sequences enclosed in curly braces embedded - in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." + The text can optionally have ARPAbet sequences enclosed in curly braces + embedded in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 
Args: text: string to convert to a sequence @@ -24,51 +25,53 @@ def text_to_sequence(text, cleaner_names): Returns: List of integers corresponding to the symbols in the text - ''' - sequence = [] + """ + sequence = [] - # Check for curly braces and treat their contents as ARPAbet: - while len(text): - m = _curly_re.match(text) - if not m: - sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) - break - sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) - sequence += _arpabet_to_sequence(m.group(2)) - text = m.group(3) + # Check for curly braces and treat their contents as ARPAbet: + while len(text): + m = _curly_re.match(text) + if not m: + sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) + break + sequence += _symbols_to_sequence( + _clean_text(m.group(1), cleaner_names) + ) + sequence += _arpabet_to_sequence(m.group(2)) + text = m.group(3) - return sequence + return sequence def sequence_to_text(sequence): - '''Converts a sequence of IDs back to a string''' - result = '' - for symbol_id in sequence: - if symbol_id in _id_to_symbol: - s = _id_to_symbol[symbol_id] - # Enclose ARPAbet back in curly braces: - if len(s) > 1 and s[0] == '@': - s = '{%s}' % s[1:] - result += s - return result.replace('}{', ' ') + """Converts a sequence of IDs back to a string""" + result = "" + for symbol_id in sequence: + if symbol_id in _id_to_symbol: + s = _id_to_symbol[symbol_id] + # Enclose ARPAbet back in curly braces: + if len(s) > 1 and s[0] == "@": + s = "{%s}" % s[1:] + result += s + return result.replace("}{", " ") def _clean_text(text, cleaner_names): - for name in cleaner_names: - cleaner = getattr(cleaners, name) - if not cleaner: - raise Exception('Unknown cleaner: %s' % name) - text = cleaner(text) - return text + for name in cleaner_names: + cleaner = getattr(cleaners, name) + if not cleaner: + raise Exception("Unknown cleaner: %s" % name) + text = cleaner(text) + return text def _symbols_to_sequence(symbols): - return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] + return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] def _arpabet_to_sequence(text): - return _symbols_to_sequence(['@' + s for s in text.split()]) + return _symbols_to_sequence(["@" + s for s in text.split()]) def _should_keep_symbol(s): - return s in _symbol_to_id and s is not '_' and s is not '~' + return s in _symbol_to_id and s != "_" and s != "~" diff --git a/text/cleaners.py b/text/cleaners.py index e2e35c1..7513329 100644 --- a/text/cleaners.py +++ b/text/cleaners.py @@ -1,90 +1,99 @@ -""" from https://github.com/keithito/tacotron """ - -''' -Cleaners are transformations that run over the input text at both training and eval time. - -Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" -hyperparameter. Some cleaners are English-specific. You'll typically want to use: - 1. "english_cleaners" for English text - 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using - the Unidecode library (https://pypi.python.org/pypi/Unidecode) - 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update - the symbols in symbols.py to match your data). -''' - +# -*- coding: utf-8 -*- import re from unidecode import unidecode from .numbers import normalize_numbers +""" from https://github.com/keithito/tacotron """ +""" +Cleaners are transformations that run over the input text at both training and +eval time. 
+ +Cleaners can be selected by passing a comma-delimited list of cleaner names as +the "cleaners" +hyperparameter. Some cleaners are English-specific. You'll typically want to +use: + 1. "english_cleaners" for English text + 2. "transliteration_cleaners" for non-English text that can be transliterated + to ASCII using + the Unidecode library (https://pypi.python.org/pypi/Unidecode) + 3. "basic_cleaners" if you do not want to transliterate (in this case, you + should also update + the symbols in symbols.py to match your data). +""" # Regular expression matching whitespace: -_whitespace_re = re.compile(r'\s+') +_whitespace_re = re.compile(r"\s+") # List of (regular expression, replacement) pairs for abbreviations: -_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ - ('mrs', 'misess'), - ('mr', 'mister'), - ('dr', 'doctor'), - ('st', 'saint'), - ('co', 'company'), - ('jr', 'junior'), - ('maj', 'major'), - ('gen', 'general'), - ('drs', 'doctors'), - ('rev', 'reverend'), - ('lt', 'lieutenant'), - ('hon', 'honorable'), - ('sgt', 'sergeant'), - ('capt', 'captain'), - ('esq', 'esquire'), - ('ltd', 'limited'), - ('col', 'colonel'), - ('ft', 'fort'), -]] +_abbreviations = [ + (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + for x in [ + ("mrs", "misess"), + ("mr", "mister"), + ("dr", "doctor"), + ("st", "saint"), + ("co", "company"), + ("jr", "junior"), + ("maj", "major"), + ("gen", "general"), + ("drs", "doctors"), + ("rev", "reverend"), + ("lt", "lieutenant"), + ("hon", "honorable"), + ("sgt", "sergeant"), + ("capt", "captain"), + ("esq", "esquire"), + ("ltd", "limited"), + ("col", "colonel"), + ("ft", "fort"), + ] +] def expand_abbreviations(text): - for regex, replacement in _abbreviations: - text = re.sub(regex, replacement, text) - return text + for regex, replacement in _abbreviations: + text = re.sub(regex, replacement, text) + return text def expand_numbers(text): - return normalize_numbers(text) + return normalize_numbers(text) def lowercase(text): - return text.lower() + return text.lower() def collapse_whitespace(text): - return re.sub(_whitespace_re, ' ', text) + return re.sub(_whitespace_re, " ", text) def convert_to_ascii(text): - return unidecode(text) + return unidecode(text) def basic_cleaners(text): - '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' - text = lowercase(text) - text = collapse_whitespace(text) - return text + """Basic pipeline that lowercases and collapses whitespace without + transliteration.""" + text = lowercase(text) + text = collapse_whitespace(text) + return text def transliteration_cleaners(text): - '''Pipeline for non-English text that transliterates to ASCII.''' - text = convert_to_ascii(text) - text = lowercase(text) - text = collapse_whitespace(text) - return text + """Pipeline for non-English text that transliterates to ASCII.""" + text = convert_to_ascii(text) + text = lowercase(text) + text = collapse_whitespace(text) + return text def english_cleaners(text): - '''Pipeline for English text, including number and abbreviation expansion.''' - text = convert_to_ascii(text) - text = lowercase(text) - text = expand_numbers(text) - text = expand_abbreviations(text) - text = collapse_whitespace(text) - return text + """Pipeline for English text, including number and abbreviation + expansion.""" + text = convert_to_ascii(text) + text = lowercase(text) + text = expand_numbers(text) + text = expand_abbreviations(text) + text = collapse_whitespace(text) + return text diff --git 
a/text/cmudict.py b/text/cmudict.py index 62bfef7..c111cda 100644 --- a/text/cmudict.py +++ b/text/cmudict.py @@ -1,65 +1,143 @@ +# -*- coding: utf-8 -*- """ from https://github.com/keithito/tacotron """ import re - valid_symbols = [ - 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', - 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', - 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', - 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', - 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', - 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', - 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' + "AA", + "AA0", + "AA1", + "AA2", + "AE", + "AE0", + "AE1", + "AE2", + "AH", + "AH0", + "AH1", + "AH2", + "AO", + "AO0", + "AO1", + "AO2", + "AW", + "AW0", + "AW1", + "AW2", + "AY", + "AY0", + "AY1", + "AY2", + "B", + "CH", + "D", + "DH", + "EH", + "EH0", + "EH1", + "EH2", + "ER", + "ER0", + "ER1", + "ER2", + "EY", + "EY0", + "EY1", + "EY2", + "F", + "G", + "HH", + "IH", + "IH0", + "IH1", + "IH2", + "IY", + "IY0", + "IY1", + "IY2", + "JH", + "K", + "L", + "M", + "N", + "NG", + "OW", + "OW0", + "OW1", + "OW2", + "OY", + "OY0", + "OY1", + "OY2", + "P", + "R", + "S", + "SH", + "T", + "TH", + "UH", + "UH0", + "UH1", + "UH2", + "UW", + "UW0", + "UW1", + "UW2", + "V", + "W", + "Y", + "Z", + "ZH", ] _valid_symbol_set = set(valid_symbols) class CMUDict: - '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' - def __init__(self, file_or_path, keep_ambiguous=True): - if isinstance(file_or_path, str): - with open(file_or_path, encoding='latin-1') as f: - entries = _parse_cmudict(f) - else: - entries = _parse_cmudict(file_or_path) - if not keep_ambiguous: - entries = {word: pron for word, pron in entries.items() if len(pron) == 1} - self._entries = entries + """Thin wrapper around CMUDict data. 
+ http://www.speech.cs.cmu.edu/cgi-bin/cmudict""" + + def __init__(self, file_or_path, keep_ambiguous=True): + if isinstance(file_or_path, str): + with open(file_or_path, encoding="latin-1") as f: + entries = _parse_cmudict(f) + else: + entries = _parse_cmudict(file_or_path) + if not keep_ambiguous: + entries = { + word: pron for word, pron in entries.items() if len(pron) == 1 + } + self._entries = entries + + def __len__(self): + return len(self._entries) + + def lookup(self, word): + """Returns list of ARPAbet pronunciations of the given word.""" + return self._entries.get(word.upper()) - def __len__(self): - return len(self._entries) - - - def lookup(self, word): - '''Returns list of ARPAbet pronunciations of the given word.''' - return self._entries.get(word.upper()) - - - -_alt_re = re.compile(r'\([0-9]+\)') +_alt_re = re.compile(r"\([0-9]+\)") def _parse_cmudict(file): - cmudict = {} - for line in file: - if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): - parts = line.split(' ') - word = re.sub(_alt_re, '', parts[0]) - pronunciation = _get_pronunciation(parts[1]) - if pronunciation: - if word in cmudict: - cmudict[word].append(pronunciation) - else: - cmudict[word] = [pronunciation] - return cmudict + cmudict = {} + for line in file: + if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"): + parts = line.split(" ") + word = re.sub(_alt_re, "", parts[0]) + pronunciation = _get_pronunciation(parts[1]) + if pronunciation: + if word in cmudict: + cmudict[word].append(pronunciation) + else: + cmudict[word] = [pronunciation] + return cmudict def _get_pronunciation(s): - parts = s.strip().split(' ') - for part in parts: - if part not in _valid_symbol_set: - return None - return ' '.join(parts) + parts = s.strip().split(" ") + for part in parts: + if part not in _valid_symbol_set: + return None + return " ".join(parts) diff --git a/text/numbers.py b/text/numbers.py index 0d5f7fa..0c87c62 100644 --- a/text/numbers.py +++ b/text/numbers.py @@ -1,71 +1,73 @@ +# -*- coding: utf-8 -*- """ from https://github.com/keithito/tacotron """ import inflect import re - _inflect = inflect.engine() -_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') -_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') -_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') -_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') -_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') -_number_re = re.compile(r'[0-9]+') +_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])") +_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)") +_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)") +_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)") +_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)") +_number_re = re.compile(r"[0-9]+") def _remove_commas(m): - return m.group(1).replace(',', '') + return m.group(1).replace(",", "") def _expand_decimal_point(m): - return m.group(1).replace('.', ' point ') + return m.group(1).replace(".", " point ") def _expand_dollars(m): - match = m.group(1) - parts = match.split('.') - if len(parts) > 2: - return match + ' dollars' # Unexpected format - dollars = int(parts[0]) if parts[0] else 0 - cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 - if dollars and cents: - dollar_unit = 'dollar' if dollars == 1 else 'dollars' - cent_unit = 'cent' if cents == 1 else 'cents' - return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) - elif dollars: - dollar_unit = 'dollar' if dollars == 1 else 'dollars' - return '%s %s' % (dollars, dollar_unit) - elif cents: - cent_unit = 'cent' 
if cents == 1 else 'cents' - return '%s %s' % (cents, cent_unit) - else: - return 'zero dollars' + match = m.group(1) + parts = match.split(".") + if len(parts) > 2: + return match + " dollars" # Unexpected format + dollars = int(parts[0]) if parts[0] else 0 + cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 + if dollars and cents: + dollar_unit = "dollar" if dollars == 1 else "dollars" + cent_unit = "cent" if cents == 1 else "cents" + return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit) + elif dollars: + dollar_unit = "dollar" if dollars == 1 else "dollars" + return "%s %s" % (dollars, dollar_unit) + elif cents: + cent_unit = "cent" if cents == 1 else "cents" + return "%s %s" % (cents, cent_unit) + else: + return "zero dollars" def _expand_ordinal(m): - return _inflect.number_to_words(m.group(0)) + return _inflect.number_to_words(m.group(0)) def _expand_number(m): - num = int(m.group(0)) - if num > 1000 and num < 3000: - if num == 2000: - return 'two thousand' - elif num > 2000 and num < 2010: - return 'two thousand ' + _inflect.number_to_words(num % 100) - elif num % 100 == 0: - return _inflect.number_to_words(num // 100) + ' hundred' + num = int(m.group(0)) + if num > 1000 and num < 3000: + if num == 2000: + return "two thousand" + elif num > 2000 and num < 2010: + return "two thousand " + _inflect.number_to_words(num % 100) + elif num % 100 == 0: + return _inflect.number_to_words(num // 100) + " hundred" + else: + return _inflect.number_to_words( + num, andword="", zero="oh", group=2 + ).replace(", ", " ") else: - return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') - else: - return _inflect.number_to_words(num, andword='') + return _inflect.number_to_words(num, andword="") def normalize_numbers(text): - text = re.sub(_comma_number_re, _remove_commas, text) - text = re.sub(_pounds_re, r'\1 pounds', text) - text = re.sub(_dollars_re, _expand_dollars, text) - text = re.sub(_decimal_number_re, _expand_decimal_point, text) - text = re.sub(_ordinal_re, _expand_ordinal, text) - text = re.sub(_number_re, _expand_number, text) - return text + text = re.sub(_comma_number_re, _remove_commas, text) + text = re.sub(_pounds_re, r"\1 pounds", text) + text = re.sub(_dollars_re, _expand_dollars, text) + text = re.sub(_decimal_number_re, _expand_decimal_point, text) + text = re.sub(_ordinal_re, _expand_ordinal, text) + text = re.sub(_number_re, _expand_number, text) + return text diff --git a/text/symbols.py b/text/symbols.py index 1be47bf..c440aea 100644 --- a/text/symbols.py +++ b/text/symbols.py @@ -1,18 +1,24 @@ -""" from https://github.com/keithito/tacotron """ +# -*- coding: utf-8 -*- +from . import cmudict -''' +""" from https://github.com/keithito/tacotron """ +""" Defines the set of symbols used in text input to the model. -The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' -from text import cmudict +The default is a set of ASCII characters that works well for English or text +that has been run through Unidecode. For other data, you can modify +_characters. See TRAINING_DATA.md for details. """ -_pad = '_' -_punctuation = '!\'(),.:;? ' -_special = '-' -_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' +_pad = "_" +_punctuation = "!'(),.:;? 
" +_special = "-" +_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" -# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): -_arpabet = ['@' + s for s in cmudict.valid_symbols] +# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as +# uppercase letters): +_arpabet = ["@" + s for s in cmudict.valid_symbols] # Export all symbols: -symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet +symbols = ( + [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet +) diff --git a/train.py b/train.py deleted file mode 100644 index c7374c7..0000000 --- a/train.py +++ /dev/null @@ -1,290 +0,0 @@ -import os -import time -import argparse -import math -from numpy import finfo - -import torch -from distributed import apply_gradient_allreduce -import torch.distributed as dist -from torch.utils.data.distributed import DistributedSampler -from torch.utils.data import DataLoader - -from model import Tacotron2 -from data_utils import TextMelLoader, TextMelCollate -from loss_function import Tacotron2Loss -from logger import Tacotron2Logger -from hparams import create_hparams - - -def reduce_tensor(tensor, n_gpus): - rt = tensor.clone() - dist.all_reduce(rt, op=dist.reduce_op.SUM) - rt /= n_gpus - return rt - - -def init_distributed(hparams, n_gpus, rank, group_name): - assert torch.cuda.is_available(), "Distributed mode requires CUDA." - print("Initializing Distributed") - - # Set cuda device so everything is done on the right GPU. - torch.cuda.set_device(rank % torch.cuda.device_count()) - - # Initialize distributed communication - dist.init_process_group( - backend=hparams.dist_backend, init_method=hparams.dist_url, - world_size=n_gpus, rank=rank, group_name=group_name) - - print("Done initializing distributed") - - -def prepare_dataloaders(hparams): - # Get data, data loaders and collate function ready - trainset = TextMelLoader(hparams.training_files, hparams) - valset = TextMelLoader(hparams.validation_files, hparams) - collate_fn = TextMelCollate(hparams.n_frames_per_step) - - if hparams.distributed_run: - train_sampler = DistributedSampler(trainset) - shuffle = False - else: - train_sampler = None - shuffle = True - - train_loader = DataLoader(trainset, num_workers=1, shuffle=shuffle, - sampler=train_sampler, - batch_size=hparams.batch_size, pin_memory=False, - drop_last=True, collate_fn=collate_fn) - return train_loader, valset, collate_fn - - -def prepare_directories_and_logger(output_directory, log_directory, rank): - if rank == 0: - if not os.path.isdir(output_directory): - os.makedirs(output_directory) - os.chmod(output_directory, 0o775) - logger = Tacotron2Logger(os.path.join(output_directory, log_directory)) - else: - logger = None - return logger - - -def load_model(hparams): - model = Tacotron2(hparams) - if hparams.fp16_run: - model.decoder.attention_layer.score_mask_value = finfo('float16').min - - if hparams.distributed_run: - model = apply_gradient_allreduce(model) - - return model - - -def warm_start_model(checkpoint_path, model, ignore_layers): - assert os.path.isfile(checkpoint_path) - print("Warm starting model from checkpoint '{}'".format(checkpoint_path)) - checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') - model_dict = checkpoint_dict['state_dict'] - if len(ignore_layers) > 0: - model_dict = {k: v for k, v in model_dict.items() - if k not in ignore_layers} - dummy_dict = model.state_dict() - dummy_dict.update(model_dict) - model_dict = dummy_dict - 
model.load_state_dict(model_dict) - return model - - -def load_checkpoint(checkpoint_path, model, optimizer): - assert os.path.isfile(checkpoint_path) - print("Loading checkpoint '{}'".format(checkpoint_path)) - checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') - model.load_state_dict(checkpoint_dict['state_dict']) - optimizer.load_state_dict(checkpoint_dict['optimizer']) - learning_rate = checkpoint_dict['learning_rate'] - iteration = checkpoint_dict['iteration'] - print("Loaded checkpoint '{}' from iteration {}" .format( - checkpoint_path, iteration)) - return model, optimizer, learning_rate, iteration - - -def save_checkpoint(model, optimizer, learning_rate, iteration, filepath): - print("Saving model and optimizer state at iteration {} to {}".format( - iteration, filepath)) - torch.save({'iteration': iteration, - 'state_dict': model.state_dict(), - 'optimizer': optimizer.state_dict(), - 'learning_rate': learning_rate}, filepath) - - -def validate(model, criterion, valset, iteration, batch_size, n_gpus, - collate_fn, logger, distributed_run, rank): - """Handles all the validation scoring and printing""" - model.eval() - with torch.no_grad(): - val_sampler = DistributedSampler(valset) if distributed_run else None - val_loader = DataLoader(valset, sampler=val_sampler, num_workers=1, - shuffle=False, batch_size=batch_size, - pin_memory=False, collate_fn=collate_fn) - - val_loss = 0.0 - for i, batch in enumerate(val_loader): - x, y = model.parse_batch(batch) - y_pred = model(x) - loss = criterion(y_pred, y) - if distributed_run: - reduced_val_loss = reduce_tensor(loss.data, n_gpus).item() - else: - reduced_val_loss = loss.item() - val_loss += reduced_val_loss - val_loss = val_loss / (i + 1) - - model.train() - if rank == 0: - print("Validation loss {}: {:9f} ".format(iteration, reduced_val_loss)) - logger.log_validation(reduced_val_loss, model, y, y_pred, iteration) - - -def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, - rank, group_name, hparams): - """Training and validation logging results to tensorboard and stdout - - Params - ------ - output_directory (string): directory to save checkpoints - log_directory (string) directory to save tensorboard logs - checkpoint_path(string): checkpoint path - n_gpus (int): number of gpus - rank (int): rank of current gpu - hparams (object): comma separated list of "name=value" pairs. 
- """ - if hparams.distributed_run: - init_distributed(hparams, n_gpus, rank, group_name) - - torch.manual_seed(hparams.seed) - torch.cuda.manual_seed(hparams.seed) - - model = load_model(hparams) - learning_rate = hparams.learning_rate - optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, - weight_decay=hparams.weight_decay) - - if hparams.fp16_run: - from apex import amp - model, optimizer = amp.initialize( - model, optimizer, opt_level='O2') - - if hparams.distributed_run: - model = apply_gradient_allreduce(model) - - criterion = Tacotron2Loss() - - logger = prepare_directories_and_logger( - output_directory, log_directory, rank) - - train_loader, valset, collate_fn = prepare_dataloaders(hparams) - - # Load checkpoint if one exists - iteration = 0 - epoch_offset = 0 - if checkpoint_path is not None: - if warm_start: - model = warm_start_model( - checkpoint_path, model, hparams.ignore_layers) - else: - model, optimizer, _learning_rate, iteration = load_checkpoint( - checkpoint_path, model, optimizer) - if hparams.use_saved_learning_rate: - learning_rate = _learning_rate - iteration += 1 # next iteration is iteration + 1 - epoch_offset = max(0, int(iteration / len(train_loader))) - - model.train() - is_overflow = False - # ================ MAIN TRAINNIG LOOP! =================== - for epoch in range(epoch_offset, hparams.epochs): - print("Epoch: {}".format(epoch)) - for i, batch in enumerate(train_loader): - start = time.perf_counter() - for param_group in optimizer.param_groups: - param_group['lr'] = learning_rate - - model.zero_grad() - x, y = model.parse_batch(batch) - y_pred = model(x) - - loss = criterion(y_pred, y) - if hparams.distributed_run: - reduced_loss = reduce_tensor(loss.data, n_gpus).item() - else: - reduced_loss = loss.item() - if hparams.fp16_run: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() - - if hparams.fp16_run: - grad_norm = torch.nn.utils.clip_grad_norm_( - amp.master_params(optimizer), hparams.grad_clip_thresh) - is_overflow = math.isnan(grad_norm) - else: - grad_norm = torch.nn.utils.clip_grad_norm_( - model.parameters(), hparams.grad_clip_thresh) - - optimizer.step() - - if not is_overflow and rank == 0: - duration = time.perf_counter() - start - print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format( - iteration, reduced_loss, grad_norm, duration)) - logger.log_training( - reduced_loss, grad_norm, learning_rate, duration, iteration) - - if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0): - validate(model, criterion, valset, iteration, - hparams.batch_size, n_gpus, collate_fn, logger, - hparams.distributed_run, rank) - if rank == 0: - checkpoint_path = os.path.join( - output_directory, "checkpoint_{}".format(iteration)) - save_checkpoint(model, optimizer, learning_rate, iteration, - checkpoint_path) - - iteration += 1 - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-o', '--output_directory', type=str, - help='directory to save checkpoints') - parser.add_argument('-l', '--log_directory', type=str, - help='directory to save tensorboard logs') - parser.add_argument('-c', '--checkpoint_path', type=str, default=None, - required=False, help='checkpoint path') - parser.add_argument('--warm_start', action='store_true', - help='load model weights only, ignore specified layers') - parser.add_argument('--n_gpus', type=int, default=1, - required=False, help='number of gpus') - parser.add_argument('--rank', type=int, default=0, 
- required=False, help='rank of current gpu') - parser.add_argument('--group_name', type=str, default='group_name', - required=False, help='Distributed group name') - parser.add_argument('--hparams', type=str, - required=False, help='comma separated name=value pairs') - - args = parser.parse_args() - hparams = create_hparams(args.hparams) - - torch.backends.cudnn.enabled = hparams.cudnn_enabled - torch.backends.cudnn.benchmark = hparams.cudnn_benchmark - - print("FP16 Run:", hparams.fp16_run) - print("Dynamic Loss Scaling:", hparams.dynamic_loss_scaling) - print("Distributed Run:", hparams.distributed_run) - print("cuDNN Enabled:", hparams.cudnn_enabled) - print("cuDNN Benchmark:", hparams.cudnn_benchmark) - - train(args.output_directory, args.log_directory, args.checkpoint_path, - args.warm_start, args.n_gpus, args.rank, args.group_name, hparams) diff --git a/tts.py b/tts.py deleted file mode 100644 index f115d83..0000000 --- a/tts.py +++ /dev/null @@ -1,177 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -import sys -import numpy as np -import torch -from hparams import create_hparams -from model import Tacotron2 -from train import load_model -from text import text_to_sequence -import os -import soundfile as sf -import pyaudio -import klepto -from librosa import resample -from librosa.effects import time_stretch -from sia.file_utils import cached_model_path -from sia.instruments import do_time -from glow import WaveGlow - -TTS_SAMPLE_RATE = 22050 -OUTPUT_SAMPLE_RATE = 16000 - -# https://github.com/NVIDIA/waveglow/blob/master/config.json -WAVEGLOW_CONFIG = { - "n_mel_channels": 80, - "n_flows": 12, - "n_group": 8, - "n_early_every": 4, - "n_early_size": 2, - "WN_config": { - "n_layers": 8, - "n_channels": 256, - "kernel_size": 3 - } -} - - -class TTSModel(object): - """docstring for TTSModel.""" - - def __init__(self): - super(TTSModel, self).__init__() - hparams = create_hparams() - hparams.sampling_rate = TTS_SAMPLE_RATE - self.model = load_model(hparams) - tacotron2_path = cached_model_path("tacotron2_model") - self.model.load_state_dict( - torch.load(tacotron2_path, map_location='cpu')['state_dict']) - self.model.eval() - waveglow_path = cached_model_path('waveglow_model') - self.waveglow = WaveGlow(**WAVEGLOW_CONFIG) - wave_params = torch.load(waveglow_path, map_location='cpu') - self.waveglow.load_state_dict(wave_params) - self.waveglow.eval() - for k in self.waveglow.convinv: - k.float() - self.k_cache = klepto.archives.file_archive(cached=False) - self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)( - self.synth_speech) - - # https://github.com/NVIDIA/waveglow/issues/127 - for m in self.waveglow.modules(): - if 'Conv' in str(type(m)): - setattr(m, 'padding_mode', 'zeros') - - @do_time - def synth_speech(self, t): - text = t - sequence = np.array(text_to_sequence(text, - ['english_cleaners']))[None, :] - sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long() - mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference( - sequence) - with torch.no_grad(): - audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666) - audio = audio_t[0].data.cpu().numpy() - # data = convert(audio) - slow_data = time_stretch(audio, 0.8) - float_data = resample(slow_data, TTS_SAMPLE_RATE, OUTPUT_SAMPLE_RATE) - data = float2pcm(float_data) - return data.tobytes() - - -def convert(array): - sf.write('sample.wav', array, TTS_SAMPLE_RATE) - # convert to $OUTPUT_SAMPLE_RATE - os.system('ffmpeg -i {0} -filter:a "atempo=0.80" -ar 16k {1}'.format( - 'sample.wav', 
'sample0.wav')) - data, rate = sf.read('sample0.wav', dtype='int16') - os.remove('sample.wav') - os.remove('sample0.wav') - return data - - -# https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py -def float2pcm(sig, dtype='int16'): - """Convert floating point signal with a range from -1 to 1 to PCM. - Any signal values outside the interval [-1.0, 1.0) are clipped. - No dithering is used. - Note that there are different possibilities for scaling floating - point numbers to PCM numbers, this function implements just one of - them. For an overview of alternatives see - http://blog.bjornroche.com/2009/12/int-float-int-its-jungle-out-there.html - Parameters - ---------- - sig : array_like - Input array, must have floating point type. - dtype : data type, optional - Desired (integer) data type. - Returns - ------- - numpy.ndarray - Integer data, scaled and clipped to the range of the given - *dtype*. - See Also - -------- - pcm2float, dtype - """ - sig = np.asarray(sig) - if sig.dtype.kind != 'f': - raise TypeError("'sig' must be a float array") - dtype = np.dtype(dtype) - if dtype.kind not in 'iu': - raise TypeError("'dtype' must be an integer type") - - i = np.iinfo(dtype) - abs_max = 2**(i.bits - 1) - offset = i.min + abs_max - return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype) - - -def display(data): - import IPython.display as ipd - aud = ipd.Audio(data, rate=16000) - return aud - - -def player_gen(): - audio_interface = pyaudio.PyAudio() - _audio_stream = audio_interface.open(format=pyaudio.paInt16, - channels=1, - rate=OUTPUT_SAMPLE_RATE, - output=True) - - def play_device(data): - _audio_stream.write(data) - # _audio_stream.close() - - return play_device - - -def synthesize_corpus(): - tts_model = TTSModel() - all_data = [] - for (i, line) in enumerate(open('corpus.txt').readlines()): - print('synthesizing... "{}"'.format(line.strip())) - data = tts_model.synth_speech(line.strip()) - all_data.append(data) - return all_data - - -def play_corpus(corpus_synths): - player = player_gen() - for d in corpus_synths: - player(d) - - -def main(): - corpus_synth_data = synthesize_corpus() - play_corpus(corpus_synth_data) - import ipdb - ipdb.set_trace() - - -if __name__ == '__main__': - main() diff --git a/utils.py b/utils.py deleted file mode 100644 index 8edcbcd..0000000 --- a/utils.py +++ /dev/null @@ -1,29 +0,0 @@ -import numpy as np -from scipy.io.wavfile import read -import torch - - -def get_mask_from_lengths(lengths): - max_len = torch.max(lengths).item() - ids = torch.arange(0, max_len, out=torch.LongTensor(max_len)) #initially out = torch.LongTensor(max_len) - mask = (ids < lengths.unsqueeze(1)).byte() - return mask - - -def load_wav_to_torch(full_path): - sampling_rate, data = read(full_path) - return torch.FloatTensor(data.astype(np.float32)), sampling_rate - - -def load_filepaths_and_text(filename, split="|"): - with open(filename, encoding='utf-8') as f: - filepaths_and_text = [line.strip().split(split) for line in f] - return filepaths_and_text - - -def to_gpu(x): - x = x.contiguous() - - #if torch.cuda.is_available(): #initially not commented out - # x = x.cuda(non_blocking=True) # initially not commented out - return torch.autograd.Variable(x)
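
---

Note for reviewers: the `text/` package is the only upstream Tacotron 2 code this patch keeps (everything else is deleted and replaced by the server integration). Below is a minimal, illustrative sketch of how the reformatted `text_to_sequence` / `sequence_to_text` API is consumed, mirroring the tensor shaping the removed `tts.py` used. It assumes the `text` package is importable with `unidecode` and `inflect` installed; the `model.inference(...)` call at the end is a placeholder for a loaded Tacotron 2 model and is not part of this patch.

```python
# Illustrative usage sketch (not part of the patch): text -> symbol IDs -> model input.
import numpy as np
import torch

from text import text_to_sequence, sequence_to_text

line = "Turn left on {HH AW1 S S T AH0 N} Street."

# "english_cleaners" lowercases, expands numbers/abbreviations and collapses
# whitespace; ARPAbet enclosed in curly braces bypasses cleaning entirely.
ids = text_to_sequence(line, ["english_cleaners"])
print(sequence_to_text(ids))  # round-trips back to the cleaned symbol string

# Shape the sequence the way the deleted tts.py did: a (1, T) LongTensor.
sequence = torch.from_numpy(np.array(ids)[None, :]).long()
# mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
```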