diff --git a/audio_processing.py b/audio_processing.py new file mode 100644 index 0000000..39d4d3f --- /dev/null +++ b/audio_processing.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- +import torch +import numpy as np +from scipy.signal import get_window +import librosa.util as librosa_util + + +def window_sumsquare( + window, + n_frames, + hop_length=200, + win_length=800, + n_fft=800, + dtype=np.float32, + norm=None, +): + """ + # from librosa 0.6 + Compute the sum-square envelope of a window function at a given hop length. + + This is used to estimate modulation effects induced by windowing + observations in short-time fourier transforms. + + Parameters + ---------- + window : string, tuple, number, callable, or list-like + Window specification, as in `get_window` + + n_frames : int > 0 + The number of analysis frames + + hop_length : int > 0 + The number of samples to advance between frames + + win_length : [optional] + The length of the window function. By default, this matches `n_fft`. + + n_fft : int > 0 + The length of each analysis frame. + + dtype : np.dtype + The data type of the output + + Returns + ------- + wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` + The sum-squared envelope of the window function + """ + if win_length is None: + win_length = n_fft + + n = n_fft + hop_length * (n_frames - 1) + x = np.zeros(n, dtype=dtype) + + # Compute the squared window at the desired length + win_sq = get_window(window, win_length, fftbins=True) + win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2 + win_sq = librosa_util.pad_center(win_sq, n_fft) + + # Fill the envelope + for i in range(n_frames): + sample = i * hop_length + x[sample : min(n, sample + n_fft)] += win_sq[ + : max(0, min(n_fft, n - sample)) + ] + return x + + +def griffin_lim(magnitudes, stft_fn, n_iters=30): + """ + PARAMS + ------ + magnitudes: spectrogram magnitudes + stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods + """ + + angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))) + angles = angles.astype(np.float32) + angles = torch.autograd.Variable(torch.from_numpy(angles)) + signal = stft_fn.inverse(magnitudes, angles).squeeze(1) + + for i in range(n_iters): + _, angles = stft_fn.transform(signal) + signal = stft_fn.inverse(magnitudes, angles).squeeze(1) + return signal + + +def dynamic_range_compression(x, C=1, clip_val=1e-5): + """ + PARAMS + ------ + C: compression factor + """ + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression(x, C=1): + """ + PARAMS + ------ + C: compression factor used to compress + """ + return torch.exp(x) / C diff --git a/data_utils.py b/data_utils.py new file mode 100644 index 0000000..b845d94 --- /dev/null +++ b/data_utils.py @@ -0,0 +1,134 @@ +# -*- coding: utf-8 -*- +import random +import numpy as np +import torch +import torch.utils.data + +from . import layers +from .utils import load_wav_to_torch, load_filepaths_and_text +from .text import text_to_sequence + + +class TextMelLoader(torch.utils.data.Dataset): + """ + 1) loads audio,text pairs + 2) normalizes text and converts them to sequences of one-hot vectors + 3) computes mel-spectrograms from audio files. 
+ """ + + def __init__(self, audiopaths_and_text, hparams): + self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text) + self.text_cleaners = hparams.text_cleaners + self.max_wav_value = hparams.max_wav_value + self.sampling_rate = hparams.sampling_rate + self.load_mel_from_disk = hparams.load_mel_from_disk + self.stft = layers.TacotronSTFT( + hparams.filter_length, + hparams.hop_length, + hparams.win_length, + hparams.n_mel_channels, + hparams.sampling_rate, + hparams.mel_fmin, + hparams.mel_fmax, + ) + random.seed(1234) + random.shuffle(self.audiopaths_and_text) + + def get_mel_text_pair(self, audiopath_and_text): + # separate filename and text + audiopath, text = audiopath_and_text[0], audiopath_and_text[1] + text = self.get_text(text) + mel = self.get_mel(audiopath) + return (text, mel) + + def get_mel(self, filename): + if not self.load_mel_from_disk: + audio, sampling_rate = load_wav_to_torch(filename) + if sampling_rate != self.stft.sampling_rate: + raise ValueError( + "{} {} SR doesn't match target {} SR".format( + sampling_rate, self.stft.sampling_rate + ) + ) + audio_norm = audio / self.max_wav_value + audio_norm = audio_norm.unsqueeze(0) + audio_norm = torch.autograd.Variable( + audio_norm, requires_grad=False + ) + melspec = self.stft.mel_spectrogram(audio_norm) + melspec = torch.squeeze(melspec, 0) + else: + melspec = torch.from_numpy(np.load(filename)) + assert ( + melspec.size(0) == self.stft.n_mel_channels + ), "Mel dimension mismatch: given {}, expected {}".format( + melspec.size(0), self.stft.n_mel_channels + ) + + return melspec + + def get_text(self, text): + text_norm = torch.IntTensor(text_to_sequence(text, self.text_cleaners)) + return text_norm + + def __getitem__(self, index): + return self.get_mel_text_pair(self.audiopaths_and_text[index]) + + def __len__(self): + return len(self.audiopaths_and_text) + + +class TextMelCollate: + """ Zero-pads model inputs and targets based on number of frames per setep + """ + + def __init__(self, n_frames_per_step): + self.n_frames_per_step = n_frames_per_step + + def __call__(self, batch): + """Collate's training batch from normalized text and mel-spectrogram + PARAMS + ------ + batch: [text_normalized, mel_normalized] + """ + # Right zero-pad all one-hot text sequences to max input length + input_lengths, ids_sorted_decreasing = torch.sort( + torch.LongTensor([len(x[0]) for x in batch]), + dim=0, + descending=True, + ) + max_input_len = input_lengths[0] + + text_padded = torch.LongTensor(len(batch), max_input_len) + text_padded.zero_() + for i in range(len(ids_sorted_decreasing)): + text = batch[ids_sorted_decreasing[i]][0] + text_padded[i, : text.size(0)] = text + + # Right zero-pad mel-spec + num_mels = batch[0][1].size(0) + max_target_len = max([x[1].size(1) for x in batch]) + if max_target_len % self.n_frames_per_step != 0: + rest = max_target_len % self.n_frames_per_step + max_target_len += self.n_frames_per_step - rest + assert max_target_len % self.n_frames_per_step == 0 + + # include mel padded and gate padded + mel_padded = torch.FloatTensor(len(batch), num_mels, max_target_len) + mel_padded.zero_() + gate_padded = torch.FloatTensor(len(batch), max_target_len) + gate_padded.zero_() + output_lengths = torch.LongTensor(len(batch)) + for i in range(len(ids_sorted_decreasing)): + mel = batch[ids_sorted_decreasing[i]][1] + mel_padded[i, :, : mel.size(1)] = mel + gate_padded[i, mel.size(1) - 1 :] = 1 + output_lengths[i] = mel.size(1) + + return ( + text_padded, + input_lengths, + mel_padded, + gate_padded, + 
output_lengths, + ) diff --git a/demo_client.py b/demo_client.py index b132277..ba50344 100644 --- a/demo_client.py +++ b/demo_client.py @@ -7,7 +7,7 @@ from .tts import player_gen def tts_player(): player = player_gen() - channel = grpc.insecure_channel('localhost:50060') + channel = grpc.insecure_channel("localhost:50060") stub = tts_pb2_grpc.ServerStub(channel) def play(t): @@ -20,10 +20,11 @@ def tts_player(): def main(): play = tts_player() - play('How may I help you today?') + play("How may I help you today?") import pdb + pdb.set_trace() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/glow.py b/glow.py new file mode 100644 index 0000000..e6060c8 --- /dev/null +++ b/glow.py @@ -0,0 +1,349 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# ***************************************************************************** +import torch +from torch.autograd import Variable +import torch.nn.functional as F + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.nn.functional.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.nn.functional.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +class WaveGlowLoss(torch.nn.Module): + def __init__(self, sigma=1.0): + super(WaveGlowLoss, self).__init__() + self.sigma = sigma + + def forward(self, model_output): + z, log_s_list, log_det_W_list = model_output + for i, log_s in enumerate(log_s_list): + if i == 0: + log_s_total = torch.sum(log_s) + log_det_W_total = log_det_W_list[i] + else: + log_s_total = log_s_total + torch.sum(log_s) + log_det_W_total += log_det_W_list[i] + + loss = ( + torch.sum(z * z) / (2 * self.sigma * self.sigma) + - log_s_total + - log_det_W_total + ) + return loss / (z.size(0) * z.size(1) * z.size(2)) + + +class Invertible1x1Conv(torch.nn.Module): + """ + The layer outputs both the convolution, and the log determinant + of its weight matrix. If reverse=True it does convolution with + inverse + """ + + def __init__(self, c): + super(Invertible1x1Conv, self).__init__() + self.conv = torch.nn.Conv1d( + c, c, kernel_size=1, stride=1, padding=0, bias=False + ) + + # Sample a random orthonormal matrix to initialize weights + W = torch.qr(torch.FloatTensor(c, c).normal_())[0] + + # Ensure determinant is 1.0 not -1.0 + if torch.det(W) < 0: + W[:, 0] = -1 * W[:, 0] + W = W.view(c, c, 1) + self.conv.weight.data = W + + def forward(self, z, reverse=False): + # shape + batch_size, group_size, n_of_groups = z.size() + + W = self.conv.weight.squeeze() + + if reverse: + if not hasattr(self, "W_inverse"): + # Reverse computation + W_inverse = W.inverse() + W_inverse = Variable(W_inverse[..., None]) + if z.type() == "torch.cuda.HalfTensor": + W_inverse = W_inverse.half() + self.W_inverse = W_inverse + z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) + return z + else: + # Forward computation + log_det_W = batch_size * n_of_groups * torch.logdet(W) + z = self.conv(z) + return z, log_det_W + + +class WN(torch.nn.Module): + """ + This is the WaveNet like layer for the affine coupling. The primary + difference from WaveNet is the convolutions need not be causal. There is + also no dilation size reset. The dilation only doubles on each layer + """ + + def __init__( + self, n_in_channels, n_mel_channels, n_layers, n_channels, kernel_size + ): + super(WN, self).__init__() + assert kernel_size % 2 == 1 + assert n_channels % 2 == 0 + self.n_layers = n_layers + self.n_channels = n_channels + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.cond_layers = torch.nn.ModuleList() + + start = torch.nn.Conv1d(n_in_channels, n_channels, 1) + start = torch.nn.utils.weight_norm(start, name="weight") + self.start = start + + # Initializing last layer to 0 makes the affine coupling layers + # do nothing at first. 
This helps with training stability + end = torch.nn.Conv1d(n_channels, 2 * n_in_channels, 1) + end.weight.data.zero_() + end.bias.data.zero_() + self.end = end + + for i in range(n_layers): + dilation = 2 ** i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d( + n_channels, + 2 * n_channels, + kernel_size, + dilation=dilation, + padding=padding, + ) + in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") + self.in_layers.append(in_layer) + + cond_layer = torch.nn.Conv1d(n_mel_channels, 2 * n_channels, 1) + cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") + self.cond_layers.append(cond_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * n_channels + else: + res_skip_channels = n_channels + res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm( + res_skip_layer, name="weight" + ) + self.res_skip_layers.append(res_skip_layer) + + def forward(self, forward_input): + audio, spect = forward_input + audio = self.start(audio) + for i in range(self.n_layers): + acts = fused_add_tanh_sigmoid_multiply( + self.in_layers[i](audio), + self.cond_layers[i](spect), + torch.IntTensor([self.n_channels]), + ) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + audio = res_skip_acts[:, : self.n_channels, :] + audio + skip_acts = res_skip_acts[:, self.n_channels :, :] + else: + skip_acts = res_skip_acts + + if i == 0: + output = skip_acts + else: + output = skip_acts + output + return self.end(output) + + +class WaveGlow(torch.nn.Module): + def __init__( + self, + n_mel_channels, + n_flows, + n_group, + n_early_every, + n_early_size, + WN_config, + ): + super(WaveGlow, self).__init__() + + self.upsample = torch.nn.ConvTranspose1d( + n_mel_channels, n_mel_channels, 1024, stride=256 + ) + assert n_group % 2 == 0 + self.n_flows = n_flows + self.n_group = n_group + self.n_early_every = n_early_every + self.n_early_size = n_early_size + self.WN = torch.nn.ModuleList() + self.convinv = torch.nn.ModuleList() + + n_half = int(n_group / 2) + + # Set up layers with the right sizes based on how many dimensions + # have been output already + n_remaining_channels = n_group + for k in range(n_flows): + if k % self.n_early_every == 0 and k > 0: + n_half = n_half - int(self.n_early_size / 2) + n_remaining_channels = n_remaining_channels - self.n_early_size + self.convinv.append(Invertible1x1Conv(n_remaining_channels)) + self.WN.append(WN(n_half, n_mel_channels * n_group, **WN_config)) + self.n_remaining_channels = n_remaining_channels + # Useful during inference + + def forward(self, forward_input): + """ + forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames + forward_input[1] = audio: batch x time + """ + spect, audio = forward_input + + # Upsample spectrogram to size of audio + spect = self.upsample(spect) + assert spect.size(2) >= audio.size(1) + if spect.size(2) > audio.size(1): + spect = spect[:, :, : audio.size(1)] + + spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) + spect = ( + spect.contiguous() + .view(spect.size(0), spect.size(1), -1) + .permute(0, 2, 1) + ) + + audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1) + output_audio = [] + log_s_list = [] + log_det_W_list = [] + + for k in range(self.n_flows): + if k % self.n_early_every == 0 and k > 0: + output_audio.append(audio[:, : self.n_early_size, :]) + audio = audio[:, self.n_early_size :, :] + + audio, log_det_W = 
self.convinv[k](audio) + log_det_W_list.append(log_det_W) + + n_half = int(audio.size(1) / 2) + audio_0 = audio[:, :n_half, :] + audio_1 = audio[:, n_half:, :] + + output = self.WN[k]((audio_0, spect)) + log_s = output[:, n_half:, :] + b = output[:, :n_half, :] + audio_1 = torch.exp(log_s) * audio_1 + b + log_s_list.append(log_s) + + audio = torch.cat([audio_0, audio_1], 1) + + output_audio.append(audio) + return torch.cat(output_audio, 1), log_s_list, log_det_W_list + + def infer(self, spect, sigma=1.0): + spect = self.upsample(spect) + # trim conv artifacts. maybe pad spec to kernel multiple + time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] + spect = spect[:, :, :-time_cutoff] + + spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) + spect = ( + spect.contiguous() + .view(spect.size(0), spect.size(1), -1) + .permute(0, 2, 1) + ) + + if spect.type() == "torch.cuda.HalfTensor": + audio = torch.cuda.HalfTensor( + spect.size(0), self.n_remaining_channels, spect.size(2) + ).normal_() + else: + # cuda.FloatTensor -> FloatTensor + audio = torch.FloatTensor( + spect.size(0), self.n_remaining_channels, spect.size(2) + ).normal_() + + audio = torch.autograd.Variable(sigma * audio) + + for k in reversed(range(self.n_flows)): + n_half = int(audio.size(1) / 2) + audio_0 = audio[:, :n_half, :] + audio_1 = audio[:, n_half:, :] + + output = self.WN[k]((audio_0, spect)) + s = output[:, n_half:, :] + b = output[:, :n_half, :] + audio_1 = (audio_1 - b) / torch.exp(s) + audio = torch.cat([audio_0, audio_1], 1) + + audio = self.convinv[k](audio, reverse=True) + + if k % self.n_early_every == 0 and k > 0: + if spect.type() == "torch.cuda.HalfTensor": + z = torch.cuda.HalfTensor( + spect.size(0), self.n_early_size, spect.size(2) + ).normal_() + else: + # cuda.FloatTensor -> FloatTensor + z = torch.FloatTensor( + spect.size(0), self.n_early_size, spect.size(2) + ).normal_() + audio = torch.cat((sigma * z, audio), 1) + + audio = ( + audio.permute(0, 2, 1).contiguous().view(audio.size(0), -1).data + ) + return audio + + @staticmethod + def remove_weightnorm(model): + waveglow = model + for WN in waveglow.WN: + WN.start = torch.nn.utils.remove_weight_norm(WN.start) + WN.in_layers = remove(WN.in_layers) + WN.cond_layers = remove(WN.cond_layers) + WN.res_skip_layers = remove(WN.res_skip_layers) + return waveglow + + +def remove(conv_list): + new_conv_list = torch.nn.ModuleList() + for old_conv in conv_list: + old_conv = torch.nn.utils.remove_weight_norm(old_conv) + new_conv_list.append(old_conv) + return new_conv_list diff --git a/hparams.py b/hparams.py new file mode 100644 index 0000000..58cf525 --- /dev/null +++ b/hparams.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- +import tensorflow as tf +from .text import symbols + + +# changed path, sampling rate and batch size +def create_hparams(hparams_string=None, verbose=False): + """Create model hyperparameters. 
Parse nondefault from given string.""" + + hparams = tf.contrib.training.HParams( + ################################ + # Experiment Parameters # + ################################ + epochs=500, + iters_per_checkpoint=1000, + seed=1234, + dynamic_loss_scaling=True, + fp16_run=False, + distributed_run=False, + dist_backend="nccl", + dist_url="tcp://localhost:54321", + cudnn_enabled=True, + cudnn_benchmark=False, + ignore_layers=["embedding.weight"], + ################################ + # Data Parameters # + ################################ + load_mel_from_disk=False, + training_files="lists/tts_data_train_processed.txt", + validation_files="filelists/tts_data_val_processed.txt", + text_cleaners=["english_cleaners"], + ################################ + # Audio Parameters # + ################################ + max_wav_value=32768.0, + sampling_rate=16000, + filter_length=1024, + hop_length=256, + win_length=1024, + n_mel_channels=80, + mel_fmin=0.0, + mel_fmax=8000.0, + ################################ + # Model Parameters # + ################################ + n_symbols=len(symbols), + symbols_embedding_dim=512, + # Encoder parameters + encoder_kernel_size=5, + encoder_n_convolutions=3, + encoder_embedding_dim=512, + # Decoder parameters + n_frames_per_step=1, # currently only 1 is supported + decoder_rnn_dim=1024, + prenet_dim=256, + max_decoder_steps=1000, + gate_threshold=0.5, + p_attention_dropout=0.1, + p_decoder_dropout=0.1, + # Attention parameters + attention_rnn_dim=1024, + attention_dim=128, + # Location Layer parameters + attention_location_n_filters=32, + attention_location_kernel_size=31, + # Mel-post processing network parameters + postnet_embedding_dim=512, + postnet_kernel_size=5, + postnet_n_convolutions=5, + ################################ + # Optimization Hyperparameters # + ################################ + use_saved_learning_rate=False, + learning_rate=1e-3, + weight_decay=1e-6, + grad_clip_thresh=1.0, + batch_size=4, + mask_padding=True, # set model's padded outputs to padded values + ) + + if hparams_string: + tf.logging.info("Parsing command line hparams: %s", hparams_string) + hparams.parse(hparams_string) + + if verbose: + tf.logging.info("Final parsed hparams: %s", hparams.values()) + + return hparams diff --git a/layers.py b/layers.py new file mode 100644 index 0000000..a16c941 --- /dev/null +++ b/layers.py @@ -0,0 +1,105 @@ +# -*- coding: utf-8 -*- +import torch +from librosa.filters import mel as librosa_mel_fn +from .audio_processing import dynamic_range_compression +from .audio_processing import dynamic_range_decompression +from .stft import STFT + + +class LinearNorm(torch.nn.Module): + def __init__(self, in_dim, out_dim, bias=True, w_init_gain="linear"): + super(LinearNorm, self).__init__() + self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias) + + torch.nn.init.xavier_uniform_( + self.linear_layer.weight, + gain=torch.nn.init.calculate_gain(w_init_gain), + ) + + def forward(self, x): + return self.linear_layer(x) + + +class ConvNorm(torch.nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=None, + dilation=1, + bias=True, + w_init_gain="linear", + ): + super(ConvNorm, self).__init__() + if padding is None: + assert kernel_size % 2 == 1 + padding = int(dilation * (kernel_size - 1) / 2) + + self.conv = torch.nn.Conv1d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=bias, + ) + + torch.nn.init.xavier_uniform_( + 
self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain) + ) + + def forward(self, signal): + conv_signal = self.conv(signal) + return conv_signal + + +class TacotronSTFT(torch.nn.Module): + def __init__( + self, + filter_length=1024, + hop_length=256, + win_length=1024, + n_mel_channels=80, + sampling_rate=22050, + mel_fmin=0.0, + mel_fmax=8000.0, + ): + super(TacotronSTFT, self).__init__() + self.n_mel_channels = n_mel_channels + self.sampling_rate = sampling_rate + self.stft_fn = STFT(filter_length, hop_length, win_length) + mel_basis = librosa_mel_fn( + sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax + ) + mel_basis = torch.from_numpy(mel_basis).float() + self.register_buffer("mel_basis", mel_basis) + + def spectral_normalize(self, magnitudes): + output = dynamic_range_compression(magnitudes) + return output + + def spectral_de_normalize(self, magnitudes): + output = dynamic_range_decompression(magnitudes) + return output + + def mel_spectrogram(self, y): + """Computes mel-spectrograms from a batch of waves + PARAMS + ------ + y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1] + + RETURNS + ------- + mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) + """ + assert torch.min(y.data) >= -1 + assert torch.max(y.data) <= 1 + + magnitudes, phases = self.stft_fn.transform(y) + magnitudes = magnitudes.data + mel_output = torch.matmul(self.mel_basis, magnitudes) + mel_output = self.spectral_normalize(mel_output) + return mel_output diff --git a/loss_function.py b/loss_function.py new file mode 100644 index 0000000..6d88252 --- /dev/null +++ b/loss_function.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +from torch import nn + + +class Tacotron2Loss(nn.Module): + def __init__(self): + super(Tacotron2Loss, self).__init__() + + def forward(self, model_output, targets): + mel_target, gate_target = targets[0], targets[1] + mel_target.requires_grad = False + gate_target.requires_grad = False + gate_target = gate_target.view(-1, 1) + + mel_out, mel_out_postnet, gate_out, _ = model_output + gate_out = gate_out.view(-1, 1) + mel_loss = nn.MSELoss()(mel_out, mel_target) + nn.MSELoss()( + mel_out_postnet, mel_target + ) + gate_loss = nn.BCEWithLogitsLoss()(gate_out, gate_target) + return mel_loss + gate_loss diff --git a/model.py b/model.py new file mode 100644 index 0000000..98c1c4a --- /dev/null +++ b/model.py @@ -0,0 +1,644 @@ +# -*- coding: utf-8 -*- +from math import sqrt +import torch +from torch.autograd import Variable +from torch import nn +from torch.nn import functional as F +from .layers import ConvNorm, LinearNorm +from .utils import to_gpu, get_mask_from_lengths + + +class LocationLayer(nn.Module): + def __init__( + self, attention_n_filters, attention_kernel_size, attention_dim + ): + super(LocationLayer, self).__init__() + padding = int((attention_kernel_size - 1) / 2) + self.location_conv = ConvNorm( + 2, + attention_n_filters, + kernel_size=attention_kernel_size, + padding=padding, + bias=False, + stride=1, + dilation=1, + ) + self.location_dense = LinearNorm( + attention_n_filters, attention_dim, bias=False, w_init_gain="tanh" + ) + + def forward(self, attention_weights_cat): + processed_attention = self.location_conv(attention_weights_cat) + processed_attention = processed_attention.transpose(1, 2) + processed_attention = self.location_dense(processed_attention) + return processed_attention + + +class Attention(nn.Module): + def __init__( + self, + attention_rnn_dim, + embedding_dim, + attention_dim, + attention_location_n_filters, + 
attention_location_kernel_size, + ): + super(Attention, self).__init__() + self.query_layer = LinearNorm( + attention_rnn_dim, attention_dim, bias=False, w_init_gain="tanh" + ) + self.memory_layer = LinearNorm( + embedding_dim, attention_dim, bias=False, w_init_gain="tanh" + ) + self.v = LinearNorm(attention_dim, 1, bias=False) + self.location_layer = LocationLayer( + attention_location_n_filters, + attention_location_kernel_size, + attention_dim, + ) + self.score_mask_value = -float("inf") + + def get_alignment_energies( + self, query, processed_memory, attention_weights_cat + ): + """ + PARAMS + ------ + query: decoder output (batch, n_mel_channels * n_frames_per_step) + processed_memory: processed encoder outputs (B, T_in, attention_dim) + attention_weights_cat: cumul. and prev. att weights (B, 2, max_time) + + RETURNS + ------- + alignment (batch, max_time) + """ + + processed_query = self.query_layer(query.unsqueeze(1)) + processed_attention_weights = self.location_layer( + attention_weights_cat + ) + energies = self.v( + torch.tanh( + processed_query + + processed_attention_weights + + processed_memory + ) + ) + + energies = energies.squeeze(-1) + return energies + + def forward( + self, + attention_hidden_state, + memory, + processed_memory, + attention_weights_cat, + mask, + ): + """ + PARAMS + ------ + attention_hidden_state: attention rnn last output + memory: encoder outputs + processed_memory: processed encoder outputs + attention_weights_cat: previous and cummulative attention weights + mask: binary mask for padded data + """ + alignment = self.get_alignment_energies( + attention_hidden_state, processed_memory, attention_weights_cat + ) + + if mask is not None: + alignment.data.masked_fill_(mask, self.score_mask_value) + + attention_weights = F.softmax(alignment, dim=1) + attention_context = torch.bmm(attention_weights.unsqueeze(1), memory) + attention_context = attention_context.squeeze(1) + + return attention_context, attention_weights + + +class Prenet(nn.Module): + def __init__(self, in_dim, sizes): + super(Prenet, self).__init__() + in_sizes = [in_dim] + sizes[:-1] + self.layers = nn.ModuleList( + [ + LinearNorm(in_size, out_size, bias=False) + for (in_size, out_size) in zip(in_sizes, sizes) + ] + ) + + def forward(self, x): + for linear in self.layers: + x = F.dropout(F.relu(linear(x)), p=0.5, training=True) + return x + + +class Postnet(nn.Module): + """Postnet + - Five 1-d convolution with 512 channels and kernel size 5 + """ + + def __init__(self, hparams): + super(Postnet, self).__init__() + self.convolutions = nn.ModuleList() + + self.convolutions.append( + nn.Sequential( + ConvNorm( + hparams.n_mel_channels, + hparams.postnet_embedding_dim, + kernel_size=hparams.postnet_kernel_size, + stride=1, + padding=int((hparams.postnet_kernel_size - 1) / 2), + dilation=1, + w_init_gain="tanh", + ), + nn.BatchNorm1d(hparams.postnet_embedding_dim), + ) + ) + + for i in range(1, hparams.postnet_n_convolutions - 1): + self.convolutions.append( + nn.Sequential( + ConvNorm( + hparams.postnet_embedding_dim, + hparams.postnet_embedding_dim, + kernel_size=hparams.postnet_kernel_size, + stride=1, + padding=int((hparams.postnet_kernel_size - 1) / 2), + dilation=1, + w_init_gain="tanh", + ), + nn.BatchNorm1d(hparams.postnet_embedding_dim), + ) + ) + + self.convolutions.append( + nn.Sequential( + ConvNorm( + hparams.postnet_embedding_dim, + hparams.n_mel_channels, + kernel_size=hparams.postnet_kernel_size, + stride=1, + padding=int((hparams.postnet_kernel_size - 1) / 2), + dilation=1, + 
w_init_gain="linear", + ), + nn.BatchNorm1d(hparams.n_mel_channels), + ) + ) + + def forward(self, x): + for i in range(len(self.convolutions) - 1): + x = F.dropout( + torch.tanh(self.convolutions[i](x)), 0.5, self.training + ) + x = F.dropout(self.convolutions[-1](x), 0.5, self.training) + + return x + + +class Encoder(nn.Module): + """Encoder module: + - Three 1-d convolution banks + - Bidirectional LSTM + """ + + def __init__(self, hparams): + super(Encoder, self).__init__() + + convolutions = [] + for _ in range(hparams.encoder_n_convolutions): + conv_layer = nn.Sequential( + ConvNorm( + hparams.encoder_embedding_dim, + hparams.encoder_embedding_dim, + kernel_size=hparams.encoder_kernel_size, + stride=1, + padding=int((hparams.encoder_kernel_size - 1) / 2), + dilation=1, + w_init_gain="relu", + ), + nn.BatchNorm1d(hparams.encoder_embedding_dim), + ) + convolutions.append(conv_layer) + self.convolutions = nn.ModuleList(convolutions) + + self.lstm = nn.LSTM( + hparams.encoder_embedding_dim, + int(hparams.encoder_embedding_dim / 2), + 1, + batch_first=True, + bidirectional=True, + ) + + def forward(self, x, input_lengths): + for conv in self.convolutions: + x = F.dropout(F.relu(conv(x)), 0.5, self.training) + + x = x.transpose(1, 2) + + # pytorch tensor are not reversible, hence the conversion + input_lengths = input_lengths.cpu().numpy() + x = nn.utils.rnn.pack_padded_sequence( + x, input_lengths, batch_first=True + ) + + self.lstm.flatten_parameters() + outputs, _ = self.lstm(x) + + outputs, _ = nn.utils.rnn.pad_packed_sequence( + outputs, batch_first=True + ) + + return outputs + + def inference(self, x): + for conv in self.convolutions: + x = F.dropout(F.relu(conv(x)), 0.5, self.training) + + x = x.transpose(1, 2) + + self.lstm.flatten_parameters() + outputs, _ = self.lstm(x) + + return outputs + + +class Decoder(nn.Module): + def __init__(self, hparams): + super(Decoder, self).__init__() + self.n_mel_channels = hparams.n_mel_channels + self.n_frames_per_step = hparams.n_frames_per_step + self.encoder_embedding_dim = hparams.encoder_embedding_dim + self.attention_rnn_dim = hparams.attention_rnn_dim + self.decoder_rnn_dim = hparams.decoder_rnn_dim + self.prenet_dim = hparams.prenet_dim + self.max_decoder_steps = hparams.max_decoder_steps + self.gate_threshold = hparams.gate_threshold + self.p_attention_dropout = hparams.p_attention_dropout + self.p_decoder_dropout = hparams.p_decoder_dropout + + self.prenet = Prenet( + hparams.n_mel_channels * hparams.n_frames_per_step, + [hparams.prenet_dim, hparams.prenet_dim], + ) + + self.attention_rnn = nn.LSTMCell( + hparams.prenet_dim + hparams.encoder_embedding_dim, + hparams.attention_rnn_dim, + ) + + self.attention_layer = Attention( + hparams.attention_rnn_dim, + hparams.encoder_embedding_dim, + hparams.attention_dim, + hparams.attention_location_n_filters, + hparams.attention_location_kernel_size, + ) + + self.decoder_rnn = nn.LSTMCell( + hparams.attention_rnn_dim + hparams.encoder_embedding_dim, + hparams.decoder_rnn_dim, + 1, + ) + + self.linear_projection = LinearNorm( + hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, + hparams.n_mel_channels * hparams.n_frames_per_step, + ) + + self.gate_layer = LinearNorm( + hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, + 1, + bias=True, + w_init_gain="sigmoid", + ) + + def get_go_frame(self, memory): + """ Gets all zeros frames to use as first decoder input + PARAMS + ------ + memory: decoder outputs + + RETURNS + ------- + decoder_input: all zeros frames + """ + B = memory.size(0) 
+ decoder_input = Variable( + memory.data.new( + B, self.n_mel_channels * self.n_frames_per_step + ).zero_() + ) + return decoder_input + + def initialize_decoder_states(self, memory, mask): + """ Initializes attention rnn states, decoder rnn states, attention + weights, attention cumulative weights, attention context, stores memory + and stores processed memory + PARAMS + ------ + memory: Encoder outputs + mask: Mask for padded data if training, expects None for inference + """ + B = memory.size(0) + MAX_TIME = memory.size(1) + + self.attention_hidden = Variable( + memory.data.new(B, self.attention_rnn_dim).zero_() + ) + self.attention_cell = Variable( + memory.data.new(B, self.attention_rnn_dim).zero_() + ) + + self.decoder_hidden = Variable( + memory.data.new(B, self.decoder_rnn_dim).zero_() + ) + self.decoder_cell = Variable( + memory.data.new(B, self.decoder_rnn_dim).zero_() + ) + + self.attention_weights = Variable(memory.data.new(B, MAX_TIME).zero_()) + self.attention_weights_cum = Variable( + memory.data.new(B, MAX_TIME).zero_() + ) + self.attention_context = Variable( + memory.data.new(B, self.encoder_embedding_dim).zero_() + ) + + self.memory = memory + self.processed_memory = self.attention_layer.memory_layer(memory) + self.mask = mask + + def parse_decoder_inputs(self, decoder_inputs): + """ Prepares decoder inputs, i.e. mel outputs + PARAMS + ------ + decoder_inputs: inputs used for teacher-forced training, i.e. mel-specs + + RETURNS + ------- + inputs: processed decoder inputs + + """ + # (B, n_mel_channels, T_out) -> (B, T_out, n_mel_channels) + decoder_inputs = decoder_inputs.transpose(1, 2) + decoder_inputs = decoder_inputs.view( + decoder_inputs.size(0), + int(decoder_inputs.size(1) / self.n_frames_per_step), + -1, + ) + # (B, T_out, n_mel_channels) -> (T_out, B, n_mel_channels) + decoder_inputs = decoder_inputs.transpose(0, 1) + return decoder_inputs + + def parse_decoder_outputs(self, mel_outputs, gate_outputs, alignments): + """ Prepares decoder outputs for output + PARAMS + ------ + mel_outputs: + gate_outputs: gate output energies + alignments: + + RETURNS + ------- + mel_outputs: + gate_outpust: gate output energies + alignments: + """ + # (T_out, B) -> (B, T_out) + alignments = torch.stack(alignments).transpose(0, 1) + # (T_out, B) -> (B, T_out) + gate_outputs = torch.stack(gate_outputs).transpose(0, 1) + gate_outputs = gate_outputs.contiguous() + # (T_out, B, n_mel_channels) -> (B, T_out, n_mel_channels) + mel_outputs = torch.stack(mel_outputs).transpose(0, 1).contiguous() + # decouple frames per step + mel_outputs = mel_outputs.view( + mel_outputs.size(0), -1, self.n_mel_channels + ) + # (B, T_out, n_mel_channels) -> (B, n_mel_channels, T_out) + mel_outputs = mel_outputs.transpose(1, 2) + + return mel_outputs, gate_outputs, alignments + + def decode(self, decoder_input): + """ Decoder step using stored states, attention and memory + PARAMS + ------ + decoder_input: previous mel output + + RETURNS + ------- + mel_output: + gate_output: gate output energies + attention_weights: + """ + cell_input = torch.cat((decoder_input, self.attention_context), -1) + self.attention_hidden, self.attention_cell = self.attention_rnn( + cell_input, (self.attention_hidden, self.attention_cell) + ) + self.attention_hidden = F.dropout( + self.attention_hidden, self.p_attention_dropout, self.training + ) + + attention_weights_cat = torch.cat( + ( + self.attention_weights.unsqueeze(1), + self.attention_weights_cum.unsqueeze(1), + ), + dim=1, + ) + self.attention_context, 
self.attention_weights = self.attention_layer( + self.attention_hidden, + self.memory, + self.processed_memory, + attention_weights_cat, + self.mask, + ) + + self.attention_weights_cum += self.attention_weights + decoder_input = torch.cat( + (self.attention_hidden, self.attention_context), -1 + ) + self.decoder_hidden, self.decoder_cell = self.decoder_rnn( + decoder_input, (self.decoder_hidden, self.decoder_cell) + ) + self.decoder_hidden = F.dropout( + self.decoder_hidden, self.p_decoder_dropout, self.training + ) + + decoder_hidden_attention_context = torch.cat( + (self.decoder_hidden, self.attention_context), dim=1 + ) + decoder_output = self.linear_projection( + decoder_hidden_attention_context + ) + + gate_prediction = self.gate_layer(decoder_hidden_attention_context) + return decoder_output, gate_prediction, self.attention_weights + + def forward(self, memory, decoder_inputs, memory_lengths): + """ Decoder forward pass for training + PARAMS + ------ + memory: Encoder outputs + decoder_inputs: Decoder inputs for teacher forcing. i.e. mel-specs + memory_lengths: Encoder output lengths for attention masking. + + RETURNS + ------- + mel_outputs: mel outputs from the decoder + gate_outputs: gate outputs from the decoder + alignments: sequence of attention weights from the decoder + """ + + decoder_input = self.get_go_frame(memory).unsqueeze(0) + decoder_inputs = self.parse_decoder_inputs(decoder_inputs) + decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0) + decoder_inputs = self.prenet(decoder_inputs) + + self.initialize_decoder_states( + memory, mask=~get_mask_from_lengths(memory_lengths) + ) + + mel_outputs, gate_outputs, alignments = [], [], [] + while len(mel_outputs) < decoder_inputs.size(0) - 1: + decoder_input = decoder_inputs[len(mel_outputs)] + mel_output, gate_output, attention_weights = self.decode( + decoder_input + ) + mel_outputs += [mel_output.squeeze(1)] + gate_outputs += [gate_output.squeeze()] + alignments += [attention_weights] + + mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs( + mel_outputs, gate_outputs, alignments + ) + + return mel_outputs, gate_outputs, alignments + + def inference(self, memory): + """ Decoder inference + PARAMS + ------ + memory: Encoder outputs + + RETURNS + ------- + mel_outputs: mel outputs from the decoder + gate_outputs: gate outputs from the decoder + alignments: sequence of attention weights from the decoder + """ + decoder_input = self.get_go_frame(memory) + + self.initialize_decoder_states(memory, mask=None) + + mel_outputs, gate_outputs, alignments = [], [], [] + while True: + decoder_input = self.prenet(decoder_input) + mel_output, gate_output, alignment = self.decode(decoder_input) + + mel_outputs += [mel_output.squeeze(1)] + gate_outputs += [gate_output] + alignments += [alignment] + + if torch.sigmoid(gate_output.data) > self.gate_threshold: + break + elif len(mel_outputs) == self.max_decoder_steps: + print("Warning! 
Reached max decoder steps") + break + + decoder_input = mel_output + + mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs( + mel_outputs, gate_outputs, alignments + ) + + return mel_outputs, gate_outputs, alignments + + +class Tacotron2(nn.Module): + def __init__(self, hparams): + super(Tacotron2, self).__init__() + self.mask_padding = hparams.mask_padding + self.fp16_run = hparams.fp16_run + self.n_mel_channels = hparams.n_mel_channels + self.n_frames_per_step = hparams.n_frames_per_step + self.embedding = nn.Embedding( + hparams.n_symbols, hparams.symbols_embedding_dim + ) + std = sqrt(2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim)) + val = sqrt(3.0) * std # uniform bounds for std + self.embedding.weight.data.uniform_(-val, val) + self.encoder = Encoder(hparams) + self.decoder = Decoder(hparams) + self.postnet = Postnet(hparams) + + def parse_batch(self, batch): + text_padded, input_lengths, mel_padded, gate_padded, output_lengths = ( + batch + ) + text_padded = to_gpu(text_padded).long() + input_lengths = to_gpu(input_lengths).long() + max_len = torch.max(input_lengths.data).item() + mel_padded = to_gpu(mel_padded).float() + gate_padded = to_gpu(gate_padded).float() + output_lengths = to_gpu(output_lengths).long() + + return ( + (text_padded, input_lengths, mel_padded, max_len, output_lengths), + (mel_padded, gate_padded), + ) + + def parse_output(self, outputs, output_lengths=None): + if self.mask_padding and output_lengths is not None: + mask = ~get_mask_from_lengths(output_lengths) + mask = mask.expand(self.n_mel_channels, mask.size(0), mask.size(1)) + mask = mask.permute(1, 0, 2) + + outputs[0].data.masked_fill_(mask, 0.0) + outputs[1].data.masked_fill_(mask, 0.0) + outputs[2].data.masked_fill_(mask[:, 0, :], 1e3) # gate energies + + return outputs + + def forward(self, inputs): + text_inputs, text_lengths, mels, max_len, output_lengths = inputs + text_lengths, output_lengths = text_lengths.data, output_lengths.data + + embedded_inputs = self.embedding(text_inputs).transpose(1, 2) + + encoder_outputs = self.encoder(embedded_inputs, text_lengths) + + mel_outputs, gate_outputs, alignments = self.decoder( + encoder_outputs, mels, memory_lengths=text_lengths + ) + + mel_outputs_postnet = self.postnet(mel_outputs) + mel_outputs_postnet = mel_outputs + mel_outputs_postnet + + return self.parse_output( + [mel_outputs, mel_outputs_postnet, gate_outputs, alignments], + output_lengths, + ) + + def inference(self, inputs): + embedded_inputs = self.embedding(inputs).transpose(1, 2) + encoder_outputs = self.encoder.inference(embedded_inputs) + mel_outputs, gate_outputs, alignments = self.decoder.inference( + encoder_outputs + ) + + mel_outputs_postnet = self.postnet(mel_outputs) + mel_outputs_postnet = mel_outputs + mel_outputs_postnet + + outputs = self.parse_output( + [mel_outputs, mel_outputs_postnet, gate_outputs, alignments] + ) + + return outputs diff --git a/server.py b/server.py new file mode 100644 index 0000000..d41119a --- /dev/null +++ b/server.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +import grpc +import time +from sia.proto import tts_pb2 +from sia.proto import tts_pb2_grpc +from concurrent import futures +from .tts import TTSModel + + +class TTSServer: + def __init__(self): + self.tts_model = TTSModel() + + def TextToSpeechAPI(self, request, context): + while True: + input_text = request.text + speech_response = self.tts_model.synth_speech(input_text) + return tts_pb2.SpeechResponse(response=speech_response) + + +def main(): + server = 
grpc.server(futures.ThreadPoolExecutor(max_workers=1)) + tts_server = TTSServer() + tts_pb2_grpc.add_ServerServicer_to_server(tts_server, server) + server.add_insecure_port("localhost:50060") + server.start() + print("TTSServer started!") + + try: + while True: + time.sleep(10000) + except KeyboardInterrupt: + server.start() + # server.stop(0) + + +if __name__ == "__main__": + main() diff --git a/stft.py b/stft.py new file mode 100644 index 0000000..eeaa94d --- /dev/null +++ b/stft.py @@ -0,0 +1,172 @@ +# -*- coding: utf-8 -*- +""" +BSD 3-Clause License + +Copyright (c) 2017, Prem Seetharaman +All rights reserved. + +* Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+""" + +import torch +import numpy as np +import torch.nn.functional as F +from torch.autograd import Variable +from scipy.signal import get_window +from librosa.util import pad_center, tiny +from .audio_processing import window_sumsquare + + +class STFT(torch.nn.Module): + """ + adapted from Prem Seetharaman's + https://github.com/pseeth/pytorch-stft + """ + + def __init__( + self, filter_length=800, hop_length=200, win_length=800, window="hann" + ): + super(STFT, self).__init__() + self.filter_length = filter_length + self.hop_length = hop_length + self.win_length = win_length + self.window = window + self.forward_transform = None + scale = self.filter_length / self.hop_length + fourier_basis = np.fft.fft(np.eye(self.filter_length)) + + cutoff = int((self.filter_length / 2 + 1)) + fourier_basis = np.vstack( + [ + np.real(fourier_basis[:cutoff, :]), + np.imag(fourier_basis[:cutoff, :]), + ] + ) + + forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) + inverse_basis = torch.FloatTensor( + np.linalg.pinv(scale * fourier_basis).T[:, None, :] + ) + + if window is not None: + assert filter_length >= win_length + # get window and zero center pad it to filter_length + fft_window = get_window(window, win_length, fftbins=True) + fft_window = pad_center(fft_window, filter_length) + fft_window = torch.from_numpy(fft_window).float() + + # window the bases + forward_basis *= fft_window + inverse_basis *= fft_window + + self.register_buffer("forward_basis", forward_basis.float()) + self.register_buffer("inverse_basis", inverse_basis.float()) + + def transform(self, input_data): + num_batches = input_data.size(0) + num_samples = input_data.size(1) + + self.num_samples = num_samples + + # similar to librosa, reflect-pad the input + input_data = input_data.view(num_batches, 1, num_samples) + input_data = F.pad( + input_data.unsqueeze(1), + (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), + mode="reflect", + ) + input_data = input_data.squeeze(1) + + forward_transform = F.conv1d( + input_data, + Variable(self.forward_basis, requires_grad=False), + stride=self.hop_length, + padding=0, + ) + + cutoff = int((self.filter_length / 2) + 1) + real_part = forward_transform[:, :cutoff, :] + imag_part = forward_transform[:, cutoff:, :] + + magnitude = torch.sqrt(real_part ** 2 + imag_part ** 2) + phase = torch.autograd.Variable( + torch.atan2(imag_part.data, real_part.data) + ) + + return magnitude, phase + + def inverse(self, magnitude, phase): + recombine_magnitude_phase = torch.cat( + [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1 + ) + + inverse_transform = F.conv_transpose1d( + recombine_magnitude_phase, + Variable(self.inverse_basis, requires_grad=False), + stride=self.hop_length, + padding=0, + ) + + if self.window is not None: + window_sum = window_sumsquare( + self.window, + magnitude.size(-1), + hop_length=self.hop_length, + win_length=self.win_length, + n_fft=self.filter_length, + dtype=np.float32, + ) + # remove modulation effects + approx_nonzero_indices = torch.from_numpy( + np.where(window_sum > tiny(window_sum))[0] + ) + window_sum = torch.autograd.Variable( + torch.from_numpy(window_sum), requires_grad=False + ) + # window_sum = window_sum.cuda() if magnitude.is_cuda else + # window_sum + # initially not commented out + inverse_transform[:, :, approx_nonzero_indices] /= window_sum[ + approx_nonzero_indices + ] + + # scale by hop ratio + inverse_transform *= float(self.filter_length) / self.hop_length + + inverse_transform = inverse_transform[ + :, :, 
int(self.filter_length / 2) : + ] + inverse_transform = inverse_transform[ + :, :, : -int(self.filter_length / 2) : + ] + + return inverse_transform + + def forward(self, input_data): + self.magnitude, self.phase = self.transform(input_data) + reconstruction = self.inverse(self.magnitude, self.phase) + return reconstruction diff --git a/tts.py b/tts.py index 23bf8b2..d1b6028 100644 --- a/tts.py +++ b/tts.py @@ -6,6 +6,7 @@ import torch from .hparams import create_hparams from .text import text_to_sequence from .glow import WaveGlow + # import os # import soundfile as sf import pyaudio @@ -26,11 +27,7 @@ WAVEGLOW_CONFIG = { "n_group": 8, "n_early_every": 4, "n_early_size": 2, - "WN_config": { - "n_layers": 8, - "n_channels": 256, - "kernel_size": 3 - } + "WN_config": {"n_layers": 8, "n_channels": 256, "kernel_size": 3}, } @@ -44,32 +41,36 @@ class TTSModel(object): self.model = Tacotron2(hparams) tacotron2_path = cached_model_path("tacotron2_model") self.model.load_state_dict( - torch.load(tacotron2_path, map_location='cpu')['state_dict']) + torch.load(tacotron2_path, map_location="cpu")["state_dict"] + ) self.model.eval() - waveglow_path = cached_model_path('waveglow_model') + waveglow_path = cached_model_path("waveglow_model") self.waveglow = WaveGlow(**WAVEGLOW_CONFIG) - wave_params = torch.load(waveglow_path, map_location='cpu') + wave_params = torch.load(waveglow_path, map_location="cpu") self.waveglow.load_state_dict(wave_params) self.waveglow.eval() for k in self.waveglow.convinv: k.float() self.k_cache = klepto.archives.file_archive(cached=False) self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)( - self.synth_speech) + self.synth_speech + ) # https://github.com/NVIDIA/waveglow/issues/127 for m in self.waveglow.modules(): - if 'Conv' in str(type(m)): - setattr(m, 'padding_mode', 'zeros') + if "Conv" in str(type(m)): + setattr(m, "padding_mode", "zeros") @do_time def synth_speech(self, t): text = t - sequence = np.array(text_to_sequence(text, - ['english_cleaners']))[None, :] + sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[ + None, : + ] sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long() mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference( - sequence) + sequence + ) with torch.no_grad(): audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666) audio = audio_t[0].data.cpu().numpy() @@ -92,7 +93,7 @@ class TTSModel(object): # https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py -def float2pcm(sig, dtype='int16'): +def float2pcm(sig, dtype="int16"): """Convert floating point signal with a range from -1 to 1 to PCM. Any signal values outside the interval [-1.0, 1.0) are clipped. No dithering is used. 
@@ -116,30 +117,33 @@ def float2pcm(sig, dtype='int16'): pcm2float, dtype """ sig = np.asarray(sig) - if sig.dtype.kind != 'f': + if sig.dtype.kind != "f": raise TypeError("'sig' must be a float array") dtype = np.dtype(dtype) - if dtype.kind not in 'iu': + if dtype.kind not in "iu": raise TypeError("'dtype' must be an integer type") i = np.iinfo(dtype) - abs_max = 2**(i.bits - 1) + abs_max = 2 ** (i.bits - 1) offset = i.min + abs_max return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype) def display(data): import IPython.display as ipd + aud = ipd.Audio(data, rate=16000) return aud def player_gen(): audio_interface = pyaudio.PyAudio() - _audio_stream = audio_interface.open(format=pyaudio.paInt16, - channels=1, - rate=OUTPUT_SAMPLE_RATE, - output=True) + _audio_stream = audio_interface.open( + format=pyaudio.paInt16, + channels=1, + rate=OUTPUT_SAMPLE_RATE, + output=True, + ) def play_device(data): _audio_stream.write(data) @@ -151,7 +155,7 @@ def player_gen(): def synthesize_corpus(): tts_model = TTSModel() all_data = [] - for (i, line) in enumerate(open('corpus.txt').readlines()): + for (i, line) in enumerate(open("corpus.txt").readlines()): print('synthesizing... "{}"'.format(line.strip())) data = tts_model.synth_speech(line.strip()) all_data.append(data) @@ -168,8 +172,9 @@ def main(): corpus_synth_data = synthesize_corpus() play_corpus(corpus_synth_data) import ipdb + ipdb.set_trace() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..f779bf5 --- /dev/null +++ b/utils.py @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- +import numpy as np +from scipy.io.wavfile import read +import torch + + +def get_mask_from_lengths(lengths): + max_len = torch.max(lengths).item() + ids = torch.arange( + 0, max_len, out=torch.LongTensor(max_len) + ) # initially out = torch.LongTensor(max_len) + mask = (ids < lengths.unsqueeze(1)).byte() + return mask + + +def load_wav_to_torch(full_path): + sampling_rate, data = read(full_path) + return torch.FloatTensor(data.astype(np.float32)), sampling_rate + + +def load_filepaths_and_text(filename, split="|"): + with open(filename, encoding="utf-8") as f: + filepaths_and_text = [line.strip().split(split) for line in f] + return filepaths_and_text + + +def to_gpu(x): + x = x.contiguous() + + # if torch.cuda.is_available(): #initially not commented out + # x = x.cuda(non_blocking=True) # initially not commented out + return torch.autograd.Variable(x)
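Note (not part of the patch above): the diff adds the dataset (data_utils.py), model (model.py), loss (loss_function.py) and hyperparameters (hparams.py), but no training entry point. The sketch below is one plausible way to wire those pieces together on CPU; the flat imports, the Adam optimizer and the bare single-pass loop are illustrative assumptions, and a fuller trainer would also handle the fp16/distributed/checkpoint options exposed in hparams.py.

import torch
from torch.utils.data import DataLoader

from hparams import create_hparams                     # assumed flat imports; the
from data_utils import TextMelLoader, TextMelCollate   # modules in this patch use
from model import Tacotron2                            # package-relative ones
from loss_function import Tacotron2Loss

hparams = create_hparams()
trainset = TextMelLoader(hparams.training_files, hparams)
collate_fn = TextMelCollate(hparams.n_frames_per_step)
loader = DataLoader(trainset, batch_size=hparams.batch_size,
                    shuffle=True, collate_fn=collate_fn, drop_last=True)

model = Tacotron2(hparams)
criterion = Tacotron2Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=hparams.learning_rate,
                             weight_decay=hparams.weight_decay)

model.train()
for batch in loader:
    # parse_batch splits the collated tuple into model inputs and (mel, gate) targets
    x, y = model.parse_batch(batch)
    y_pred = model(x)
    loss = criterion(y_pred, y)
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), hparams.grad_clip_thresh)
    optimizer.step()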
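Likewise, griffin_lim() in audio_processing.py is not exercised anywhere in this patch (inference in tts.py goes through WaveGlow instead). A minimal sanity check of how it pairs with the STFT module, assuming the same 1024/256/1024 analysis settings as hparams.py and a random placeholder waveform, might look like:

import torch
from stft import STFT                        # flat imports assumed, as above
from audio_processing import griffin_lim

stft_fn = STFT(filter_length=1024, hop_length=256, win_length=1024)
waveform = torch.randn(1, 16000)             # (B, T): placeholder for 1 s of 16 kHz audio
magnitudes, _ = stft_fn.transform(waveform)  # (B, filter_length // 2 + 1, n_frames)
# re-estimate phase from the magnitudes by iterating transform/inverse, then return a waveform
reconstruction = griffin_lim(magnitudes, stft_fn, n_iters=30)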