From 08ad9ce16e6561cdb4225388f573141d26dc5667 Mon Sep 17 00:00:00 2001
From: Malar Kannan
Date: Tue, 2 Jul 2019 16:16:44 +0530
Subject: [PATCH] 1. clean-up redundant code 2. remove ffmpeg dependency using
 librosa api 3. remove waveglow submodule

---
 .gitmodules    |   4 -
 README.md      |  90 +++-----------
 WORKFLOW.md    |  23 ----
 demo_client.py |  22 ++--
 final.ipynb    | 232 ------------------------------------
 final.py       | 126 --------------------
 glow.py        | 311 +++++++++++++++++++++++++++++++++++++++++++++++++
 loss_scaler.py | 131 ---------------------
 multiproc.py   |  23 ----
 tts.py         |  76 ++++++++----
 waveglow       |   1 -
 11 files changed, 396 insertions(+), 643 deletions(-)
 delete mode 100644 .gitmodules
 mode change 100755 => 100644 README.md
 delete mode 100644 WORKFLOW.md
 delete mode 100644 final.ipynb
 delete mode 100644 final.py
 create mode 100644 glow.py
 delete mode 100644 loss_scaler.py
 delete mode 100644 multiproc.py
 delete mode 160000 waveglow

diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index 051ba22..0000000
--- a/.gitmodules
+++ /dev/null
@@ -1,4 +0,0 @@
-[submodule "waveglow"]
-	path = waveglow
-	url = https://github.com/NVIDIA/waveglow
-	branch = master
diff --git a/README.md b/README.md
old mode 100755
new mode 100644
index 14cfc78..a953b8f
--- a/README.md
+++ b/README.md
@@ -1,81 +1,23 @@
-# Tacotron 2 (without wavenet)
-
-PyTorch implementation of [Natural TTS Synthesis By Conditioning
-Wavenet On Mel Spectrogram Predictions](https://arxiv.org/pdf/1712.05884.pdf).
-
-This implementation includes **distributed** and **automatic mixed precision** support
-and uses the [LJSpeech dataset](https://keithito.com/LJ-Speech-Dataset/).
-
-Distributed and Automatic Mixed Precision support relies on NVIDIA's [Apex] and [AMP].
-
-Visit our [website] for audio samples using our published [Tacotron 2] and
-[WaveGlow] models.
-
-![Alignment, Predicted Mel Spectrogram, Target Mel Spectrogram](tensorboard.png)
-
-
-## Pre-requisites
-1. NVIDIA GPU + CUDA cuDNN
 
 ## Setup
-1. Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/)
-2. Clone this repo: `git clone https://github.com/NVIDIA/tacotron2.git`
-3. CD into this repo: `cd tacotron2`
-4. Initialize submodule: `git submodule init; git submodule update`
-5. Update .wav paths: `sed -i -- 's,DUMMY,ljs_dataset_folder/wavs,g' filelists/*.txt`
-    - Alternatively, set `load_mel_from_disk=True` in `hparams.py` and update mel-spectrogram paths
-6. Install [PyTorch 1.0]
-7. Install [Apex]
-8. Install python requirements or build docker image
-    - Install python requirements: `pip install -r requirements.txt`
+- clone the repo
 
-## Training
-1. `python train.py --output_directory=outdir --log_directory=logdir`
-2. (OPTIONAL) `tensorboard --logdir=outdir/logdir`
+`git clone https://github.com/agaralabs/tacotron2`
+- cd to `tacotron2` and copy the models from wolverine:
 
-## Training using a pre-trained model
-Training using a pre-trained model can lead to faster convergence
-By default, the dataset dependent text embedding layers are [ignored]
+`scp wolverine:/home/ubuntu/tacotron2/{checkpoint_15000,waveglow_256channels.pt} ./`
 
-1. Download our published [Tacotron 2] model
-2. `python train.py --output_directory=outdir --log_directory=logdir -c tacotron2_statedict.pt --warm_start`
+`scp wolverine:/home/ubuntu/tacotron2/waveglow ./`
 
-## Multi-GPU (distributed) and Automatic Mixed Precision Training
-1. `python -m multiproc train.py --output_directory=outdir --log_directory=logdir --hparams=distributed_run=True,fp16_run=True`
+**Wolverine Details:**
+```
+Host wolverine
+    Hostname 54.71.137.17
+    User ubuntu
+    IdentityFile ~/.ssh/id_hip_ml
+```
+Install the dependencies:
+`pip install -r requirements.txt`
 
-## Inference demo
-1. Download our published [Tacotron 2] model
-2. Download our published [WaveGlow] model
-3. `jupyter notebook --ip=127.0.0.1 --port=31337`
-4. Load inference.ipynb
-
-N.b. When performing Mel-Spectrogram to Audio synthesis, make sure Tacotron 2
-and the Mel decoder were trained on the same mel-spectrogram representation.
-
-
-## Related repos
-[WaveGlow](https://github.com/NVIDIA/WaveGlow) Faster than real time Flow-based
-Generative Network for Speech Synthesis
-
-[nv-wavenet](https://github.com/NVIDIA/nv-wavenet/) Faster than real time
-WaveNet.
-
-## Acknowledgements
-This implementation uses code from the following repos: [Keith
-Ito](https://github.com/keithito/tacotron/), [Prem
-Seetharaman](https://github.com/pseeth/pytorch-stft) as described in our code.
-
-We are inspired by [Ryuchi Yamamoto's](https://github.com/r9y9/tacotron_pytorch)
-Tacotron PyTorch implementation.
-
-We are thankful to the Tacotron 2 paper authors, specially Jonathan Shen, Yuxuan
-Wang and Zongheng Yang.
-
-
-[WaveGlow]: https://drive.google.com/file/d/1WsibBTsuRg_SF2Z6L6NFRTT-NjEy1oTx/view?usp=sharing
-[Tacotron 2]: https://drive.google.com/file/d/1c5ZTuT7J08wLUoVZ2KkUs_VdZuJ86ZqA/view?usp=sharing
-[pytorch 1.0]: https://github.com/pytorch/pytorch#installation
-[website]: https://nv-adlr.github.io/WaveGlow
-[ignored]: https://github.com/NVIDIA/tacotron2/blob/master/hparams.py#L22
-[Apex]: https://github.com/nvidia/apex
-[AMP]: https://github.com/NVIDIA/apex/tree/master/apex/amp
+## Running:
+`python demo_client.py`
diff --git a/WORKFLOW.md b/WORKFLOW.md
deleted file mode 100644
index a953b8f..0000000
--- a/WORKFLOW.md
+++ /dev/null
@@ -1,23 +0,0 @@
-
-## Setup
-- clone the repo
-
-`git clone https://github.com/agaralabs/tacotron2`
-- cd to `tacotron2` copy models from wolverine:
-
-`scp wolverine:/home/ubuntu/tacotron2/{checkpoint_15000,waveglow_256channels.pt} ./`
-
-`scp wolverine:/home/ubuntu/tacotron2/waveglow ./`
-
-**Wolverine Details:**
-```
-Host wolverine
-    Hostname 54.71.137.17
-    User ubuntu
-    IdentityFile ~/.ssh/id_hip_ml
-```
-install the dependencies
-`pip install requirements.txt`
-
-## Running:
-`python final.py`
diff --git a/demo_client.py b/demo_client.py
index bc2a12e..b00a53a 100644
--- a/demo_client.py
+++ b/demo_client.py
@@ -4,15 +4,23 @@ from sia.proto import tts_pb2_grpc
 from tts import player_gen
 
 
-def main():
+def tts_player():
+    player = player_gen()
     channel = grpc.insecure_channel('localhost:50060')
     stub = tts_pb2_grpc.ServerStub(channel)
-    test_text = tts_pb2.TextInput(text='How may I help you today?')
-    speech = stub.TextToSpeechAPI(test_text)
-    player = player_gen()
-    player(speech.response)
-    import pdb
-    pdb.set_trace()
+
+    def play(t):
+        test_text = tts_pb2.TextInput(text=t)
+        speech = stub.TextToSpeechAPI(test_text)
+        player(speech.response)
+    return play
+
+
+def main():
+    play = tts_player()
+    play('How may I help you today?')
+    import ipdb
+    ipdb.set_trace()
 
 
 if __name__ == '__main__':
diff --git a/final.ipynb b/final.ipynb
deleted file mode 100644
index 29926f3..0000000
--- a/final.ipynb
+++ /dev/null
@@ -1,232 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import matplotlib\n",
- "#%matplotlib inline\n", - "import matplotlib.pylab as plt\n", - "\n", - "import IPython.display as ipd\n", - "\n", - "import sys\n", - "sys.path.append('waveglow/')\n", - "import numpy as np\n", - "import torch\n", - "\n", - "from hparams import create_hparams\n", - "from model import Tacotron2\n", - "from layers import TacotronSTFT, STFT\n", - "from audio_processing import griffin_lim\n", - "from train import load_model\n", - "from text import text_to_sequence\n", - "from denoiser import Denoiser" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "hparams = create_hparams()\n", - "hparams.sampling_rate = 22050\n", - "checkpoint_path = \"checkpoint_15000\"\n", - "model = load_model(hparams)\n", - "model.load_state_dict(torch.load(checkpoint_path, map_location = 'cpu')['state_dict']) #added map_location = 'cpu'\n", - "_ = model.eval() #it was originally model.cuda().eval().half()\n", - "waveglow_path = 'waveglow_256channels.pt'\n", - "waveglow = torch.load(waveglow_path, map_location = 'cpu')['model'] #added map_location = 'cpu'\n", - "waveglow.eval() #originally waveglow.cuda().eval().half()\n", - "for k in waveglow.convinv:\n", - " k.float()\n", - "#denoiser = Denoiser(waveglow)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import soundfile as sf" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import time" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def convert(array):\n", - " sf.write('sample.wav', array, 22050)\n", - " os.system('ffmpeg -i {0} -filter:a \"atempo=0.80\" {1}'.format('sample.wav', 'sample0.wav'))\n", - " #os.system('ffmpeg -i {0} -ar 8000 {1}'.format('sample0.wav', 'sample1.wav'))\n", - " data, rate = sf.read('sample0.wav')\n", - " os.remove('sample.wav')\n", - " os.remove('sample0.wav')\n", - " #os.remove('sample1.wav')\n", - " return data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def speech(t):\n", - " start = time.time()\n", - " text = t\n", - " sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]\n", - " print(sequence)\n", - " sequence = torch.autograd.Variable(\n", - " torch.from_numpy(sequence)).long() #originally torch.from_numpy(sequence)).cuda().long()\n", - " mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)\n", - " with torch.no_grad():\n", - " audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)\n", - " #audio_denoised = denoiser(audio, strength=0.01)[:, 0]\n", - " data = convert(audio[0].data.cpu().numpy())\n", - " #os.system('ffmpeg -i {0} -filter:a \"atempo=0.85\" {1}'.format('harvard_inference/audio/'+str(i)+'.wav', 'harvard_inference/audio_0.85/'+str(i)+'.wav'))\n", - " aud = ipd.Audio(data, rate=22050)\n", - " end = time.time()\n", - " print(end-start)\n", - " return aud" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "speech('I understand your frustration and disappointment. I am sorry that its happening and I would like to help prevent it in the future. What style of diapers did you buy? 
For instance, was it the snugglers, pull ups or baby dry.')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from final import display,speech,play_device\n", - "import pyaudio" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "15.046638011932373\n" - ] - } - ], - "source": [ - "data = speech('Thank you for calling Huggies. How may I help you today .')" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "display(data)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "def play_device(data):\n", - " audio_interface = pyaudio.PyAudio()\n", - " _audio_stream = audio_interface.open(format=pyaudio.paInt16,channels=1, rate=16000,output=True)\n", - " _audio_stream.write(data.tostring())\n", - "# _audio_stream.close()\n", - "play_device(data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/final.py b/final.py deleted file mode 100644 index 0b826b2..0000000 --- a/final.py +++ /dev/null @@ -1,126 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# import matplotlib -# import matplotlib.pylab as plt - -# import IPython.display as ipd - -import sys -import numpy as np -import torch -from hparams import create_hparams -from model import Tacotron2 -from layers import TacotronSTFT, STFT -# from audio_processing import griffin_lim -from train import load_model -from text import text_to_sequence -# from denoiser import Denoiser -import os -import soundfile as sf -import pyaudio -import klepto -import IPython.display as ipd -import time -from sia.file_utils import cached_model_path - -sys.path.append('waveglow/') -hparams = create_hparams() -hparams.sampling_rate = 22050 -model = load_model(hparams) -tacotron2_path = cached_model_path("tacotron2_model") -model.load_state_dict( - torch.load(tacotron2_path, map_location='cpu')['state_dict']) -model.eval() -waveglow_path = cached_model_path('waveglow_model') -waveglow = torch.load(waveglow_path, map_location='cpu')['model'] -waveglow.eval() -for k in waveglow.convinv: - k.float() -k_cache = klepto.archives.file_archive(cached=False) - -# https://github.com/NVIDIA/waveglow/issues/127 -for m in waveglow.modules(): - if 'Conv' in str(type(m)): - setattr(m, 'padding_mode', 'zeros') - - -def convert(array): - sf.write('sample.wav', array, 22050) - os.system('ffmpeg -i {0} -filter:a "atempo=0.80" -ar 16k {1}'.format( - 'sample.wav', 'sample0.wav')) - data, rate = sf.read('sample0.wav', dtype='int16') - os.remove('sample.wav') - os.remove('sample0.wav') - return data - - 
-@klepto.safe.inf_cache(cache=k_cache) -def speech(t): - start = time.time() - text = t - sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :] - sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long() - mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence) - with torch.no_grad(): - audio = waveglow.infer(mel_outputs_postnet, sigma=0.666) - # import ipdb; ipdb.set_trace() - data = convert(audio[0].data.cpu().numpy()) - # _audio_stream.write(data.astype('float32')) - # _audio_stream.write(data) - end = time.time() - print(end - start) - return data - - -def display(data): - aud = ipd.Audio(data, rate=16000) - return aud - - -def player_gen(): - audio_interface = pyaudio.PyAudio() - _audio_stream = audio_interface.open(format=pyaudio.paInt16, - channels=1, - rate=16000, - output=True) - - def play_device(data): - _audio_stream.write(data.tostring()) - # _audio_stream.close() - - return play_device - - -def synthesize_corpus(): - all_data = [] - for (i, line) in enumerate(open('corpus.txt').readlines()): - print('synthesizing... "{}"'.format(line.strip())) - data = speech(line.strip()) - sf.write('tts_{}.wav'.format(i), data, 16000) - all_data.append(data) - return all_data - - -def play_corpus(corpus_synths): - player = player_gen() - for d in corpus_synths: - player(d) - - -def main(): - # data = speech('Hi I am Sia. How may I help you today .'.lower()) - # audio_interface = pyaudio.PyAudio() - # _audio_stream = audio_interface.open(format=pyaudio.paInt16, - # channels=1, - # rate=16000, - # output=True) - # _audio_stream.write(data) - corpus_synth_data = synthesize_corpus() - play_corpus(corpus_synth_data) - import ipdb - ipdb.set_trace() - - -if __name__ == '__main__': - main() diff --git a/glow.py b/glow.py new file mode 100644 index 0000000..e5ce84a --- /dev/null +++ b/glow.py @@ -0,0 +1,311 @@ +# ***************************************************************************** +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# ***************************************************************************** +import copy +import torch +from torch.autograd import Variable +import torch.nn.functional as F + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a+input_b + t_act = torch.nn.functional.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.nn.functional.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +class WaveGlowLoss(torch.nn.Module): + def __init__(self, sigma=1.0): + super(WaveGlowLoss, self).__init__() + self.sigma = sigma + + def forward(self, model_output): + z, log_s_list, log_det_W_list = model_output + for i, log_s in enumerate(log_s_list): + if i == 0: + log_s_total = torch.sum(log_s) + log_det_W_total = log_det_W_list[i] + else: + log_s_total = log_s_total + torch.sum(log_s) + log_det_W_total += log_det_W_list[i] + + loss = torch.sum(z*z)/(2*self.sigma*self.sigma) - log_s_total - log_det_W_total + return loss/(z.size(0)*z.size(1)*z.size(2)) + + +class Invertible1x1Conv(torch.nn.Module): + """ + The layer outputs both the convolution, and the log determinant + of its weight matrix. If reverse=True it does convolution with + inverse + """ + def __init__(self, c): + super(Invertible1x1Conv, self).__init__() + self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0, + bias=False) + + # Sample a random orthonormal matrix to initialize weights + W = torch.qr(torch.FloatTensor(c, c).normal_())[0] + + # Ensure determinant is 1.0 not -1.0 + if torch.det(W) < 0: + W[:,0] = -1*W[:,0] + W = W.view(c, c, 1) + self.conv.weight.data = W + + def forward(self, z, reverse=False): + # shape + batch_size, group_size, n_of_groups = z.size() + + W = self.conv.weight.squeeze() + + if reverse: + if not hasattr(self, 'W_inverse'): + # Reverse computation + W_inverse = W.inverse() + W_inverse = Variable(W_inverse[..., None]) + if z.type() == 'torch.cuda.HalfTensor': + W_inverse = W_inverse.half() + self.W_inverse = W_inverse + z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) + return z + else: + # Forward computation + log_det_W = batch_size * n_of_groups * torch.logdet(W) + z = self.conv(z) + return z, log_det_W + + +class WN(torch.nn.Module): + """ + This is the WaveNet like layer for the affine coupling. The primary difference + from WaveNet is the convolutions need not be causal. There is also no dilation + size reset. The dilation only doubles on each layer + """ + def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels, + kernel_size): + super(WN, self).__init__() + assert(kernel_size % 2 == 1) + assert(n_channels % 2 == 0) + self.n_layers = n_layers + self.n_channels = n_channels + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.cond_layers = torch.nn.ModuleList() + + start = torch.nn.Conv1d(n_in_channels, n_channels, 1) + start = torch.nn.utils.weight_norm(start, name='weight') + self.start = start + + # Initializing last layer to 0 makes the affine coupling layers + # do nothing at first. 
This helps with training stability + end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1) + end.weight.data.zero_() + end.bias.data.zero_() + self.end = end + + for i in range(n_layers): + dilation = 2 ** i + padding = int((kernel_size*dilation - dilation)/2) + in_layer = torch.nn.Conv1d(n_channels, 2*n_channels, kernel_size, + dilation=dilation, padding=padding) + in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') + self.in_layers.append(in_layer) + + cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels, 1) + cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') + self.cond_layers.append(cond_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2*n_channels + else: + res_skip_channels = n_channels + res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') + self.res_skip_layers.append(res_skip_layer) + + def forward(self, forward_input): + audio, spect = forward_input + audio = self.start(audio) + for i in range(self.n_layers): + acts = fused_add_tanh_sigmoid_multiply( + self.in_layers[i](audio), + self.cond_layers[i](spect), + torch.IntTensor([self.n_channels])) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + audio = res_skip_acts[:,:self.n_channels,:] + audio + skip_acts = res_skip_acts[:,self.n_channels:,:] + else: + skip_acts = res_skip_acts + + if i == 0: + output = skip_acts + else: + output = skip_acts + output + return self.end(output) + + +class WaveGlow(torch.nn.Module): + def __init__(self, n_mel_channels, n_flows, n_group, n_early_every, + n_early_size, WN_config): + super(WaveGlow, self).__init__() + + self.upsample = torch.nn.ConvTranspose1d(n_mel_channels, + n_mel_channels, + 1024, stride=256) + assert(n_group % 2 == 0) + self.n_flows = n_flows + self.n_group = n_group + self.n_early_every = n_early_every + self.n_early_size = n_early_size + self.WN = torch.nn.ModuleList() + self.convinv = torch.nn.ModuleList() + + n_half = int(n_group/2) + + # Set up layers with the right sizes based on how many dimensions + # have been output already + n_remaining_channels = n_group + for k in range(n_flows): + if k % self.n_early_every == 0 and k > 0: + n_half = n_half - int(self.n_early_size/2) + n_remaining_channels = n_remaining_channels - self.n_early_size + self.convinv.append(Invertible1x1Conv(n_remaining_channels)) + self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config)) + self.n_remaining_channels = n_remaining_channels # Useful during inference + + def forward(self, forward_input): + """ + forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames + forward_input[1] = audio: batch x time + """ + spect, audio = forward_input + + # Upsample spectrogram to size of audio + spect = self.upsample(spect) + assert(spect.size(2) >= audio.size(1)) + if spect.size(2) > audio.size(1): + spect = spect[:, :, :audio.size(1)] + + spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) + spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) + + audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1) + output_audio = [] + log_s_list = [] + log_det_W_list = [] + + for k in range(self.n_flows): + if k % self.n_early_every == 0 and k > 0: + output_audio.append(audio[:,:self.n_early_size,:]) + audio = audio[:,self.n_early_size:,:] + + audio, log_det_W = self.convinv[k](audio) + log_det_W_list.append(log_det_W) + + n_half = int(audio.size(1)/2) + 
audio_0 = audio[:,:n_half,:] + audio_1 = audio[:,n_half:,:] + + output = self.WN[k]((audio_0, spect)) + log_s = output[:, n_half:, :] + b = output[:, :n_half, :] + audio_1 = torch.exp(log_s)*audio_1 + b + log_s_list.append(log_s) + + audio = torch.cat([audio_0, audio_1],1) + + output_audio.append(audio) + return torch.cat(output_audio,1), log_s_list, log_det_W_list + + def infer(self, spect, sigma=1.0): + spect = self.upsample(spect) + # trim conv artifacts. maybe pad spec to kernel multiple + time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] + spect = spect[:, :, :-time_cutoff] + + spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) + spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) + + if spect.type() == 'torch.cuda.HalfTensor': + audio = torch.cuda.HalfTensor(spect.size(0), + self.n_remaining_channels, + spect.size(2)).normal_() + else: + # cuda.FloatTensor -> FloatTensor + audio = torch.FloatTensor(spect.size(0), + self.n_remaining_channels, + spect.size(2)).normal_() + + audio = torch.autograd.Variable(sigma*audio) + + for k in reversed(range(self.n_flows)): + n_half = int(audio.size(1)/2) + audio_0 = audio[:,:n_half,:] + audio_1 = audio[:,n_half:,:] + + output = self.WN[k]((audio_0, spect)) + s = output[:, n_half:, :] + b = output[:, :n_half, :] + audio_1 = (audio_1 - b)/torch.exp(s) + audio = torch.cat([audio_0, audio_1],1) + + audio = self.convinv[k](audio, reverse=True) + + if k % self.n_early_every == 0 and k > 0: + if spect.type() == 'torch.cuda.HalfTensor': + z = torch.cuda.HalfTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_() + else: + # cuda.FloatTensor -> FloatTensor + z = torch.FloatTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_() + audio = torch.cat((sigma*z, audio),1) + + audio = audio.permute(0,2,1).contiguous().view(audio.size(0), -1).data + return audio + + @staticmethod + def remove_weightnorm(model): + waveglow = model + for WN in waveglow.WN: + WN.start = torch.nn.utils.remove_weight_norm(WN.start) + WN.in_layers = remove(WN.in_layers) + WN.cond_layers = remove(WN.cond_layers) + WN.res_skip_layers = remove(WN.res_skip_layers) + return waveglow + + +def remove(conv_list): + new_conv_list = torch.nn.ModuleList() + for old_conv in conv_list: + old_conv = torch.nn.utils.remove_weight_norm(old_conv) + new_conv_list.append(old_conv) + return new_conv_list diff --git a/loss_scaler.py b/loss_scaler.py deleted file mode 100644 index 88cc9cf..0000000 --- a/loss_scaler.py +++ /dev/null @@ -1,131 +0,0 @@ -import torch - -class LossScaler: - - def __init__(self, scale=1): - self.cur_scale = scale - - # `params` is a list / generator of torch.Variable - def has_overflow(self, params): - return False - - # `x` is a torch.Tensor - def _has_inf_or_nan(x): - return False - - # `overflow` is boolean indicating whether we overflowed in gradient - def update_scale(self, overflow): - pass - - @property - def loss_scale(self): - return self.cur_scale - - def scale_gradient(self, module, grad_in, grad_out): - return tuple(self.loss_scale * g for g in grad_in) - - def backward(self, loss): - scaled_loss = loss*self.loss_scale - scaled_loss.backward() - -class DynamicLossScaler: - - def __init__(self, - init_scale=2**32, - scale_factor=2., - scale_window=1000): - self.cur_scale = init_scale - self.cur_iter = 0 - self.last_overflow_iter = -1 - self.scale_factor = scale_factor - self.scale_window = scale_window - - # `params` is a list / generator of torch.Variable - def has_overflow(self, 
params): -# return False - for p in params: - if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data): - return True - - return False - - # `x` is a torch.Tensor - def _has_inf_or_nan(x): - cpu_sum = float(x.float().sum()) - if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: - return True - return False - - # `overflow` is boolean indicating whether we overflowed in gradient - def update_scale(self, overflow): - if overflow: - #self.cur_scale /= self.scale_factor - self.cur_scale = max(self.cur_scale/self.scale_factor, 1) - self.last_overflow_iter = self.cur_iter - else: - if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0: - self.cur_scale *= self.scale_factor -# self.cur_scale = 1 - self.cur_iter += 1 - - @property - def loss_scale(self): - return self.cur_scale - - def scale_gradient(self, module, grad_in, grad_out): - return tuple(self.loss_scale * g for g in grad_in) - - def backward(self, loss): - scaled_loss = loss*self.loss_scale - scaled_loss.backward() - -############################################################## -# Example usage below here -- assuming it's in a separate file -############################################################## -if __name__ == "__main__": - import torch - from torch.autograd import Variable - from dynamic_loss_scaler import DynamicLossScaler - - # N is batch size; D_in is input dimension; - # H is hidden dimension; D_out is output dimension. - N, D_in, H, D_out = 64, 1000, 100, 10 - - # Create random Tensors to hold inputs and outputs, and wrap them in Variables. - x = Variable(torch.randn(N, D_in), requires_grad=False) - y = Variable(torch.randn(N, D_out), requires_grad=False) - - w1 = Variable(torch.randn(D_in, H), requires_grad=True) - w2 = Variable(torch.randn(H, D_out), requires_grad=True) - parameters = [w1, w2] - - learning_rate = 1e-6 - optimizer = torch.optim.SGD(parameters, lr=learning_rate) - loss_scaler = DynamicLossScaler() - - for t in range(500): - y_pred = x.mm(w1).clamp(min=0).mm(w2) - loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale - print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale)) - print('Iter {} scaled loss: {}'.format(t, loss.data[0])) - print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale)) - - # Run backprop - optimizer.zero_grad() - loss.backward() - - # Check for overflow - has_overflow = DynamicLossScaler.has_overflow(parameters) - - # If no overflow, unscale grad and update as usual - if not has_overflow: - for param in parameters: - param.grad.data.mul_(1. 
/ loss_scaler.loss_scale) - optimizer.step() - # Otherwise, don't do anything -- ie, skip iteration - else: - print('OVERFLOW!') - - # Update loss scale for next iteration - loss_scaler.update_scale(has_overflow) - diff --git a/multiproc.py b/multiproc.py deleted file mode 100644 index 060ff93..0000000 --- a/multiproc.py +++ /dev/null @@ -1,23 +0,0 @@ -import time -import torch -import sys -import subprocess - -argslist = list(sys.argv)[1:] -num_gpus = torch.cuda.device_count() -argslist.append('--n_gpus={}'.format(num_gpus)) -workers = [] -job_id = time.strftime("%Y_%m_%d-%H%M%S") -argslist.append("--group_name=group_{}".format(job_id)) - -for i in range(num_gpus): - argslist.append('--rank={}'.format(i)) - stdout = None if i == 0 else open("logs/{}_GPU_{}.log".format(job_id, i), - "w") - print(argslist) - p = subprocess.Popen([str(sys.executable)]+argslist, stdout=stdout) - workers.append(p) - argslist = argslist[:-1] - -for p in workers: - p.wait() diff --git a/tts.py b/tts.py index a946fa0..30b7cbe 100644 --- a/tts.py +++ b/tts.py @@ -1,30 +1,24 @@ #!/usr/bin/env python # coding: utf-8 -# import matplotlib -# import matplotlib.pylab as plt - -# import IPython.display as ipd - import sys import numpy as np import torch from hparams import create_hparams from model import Tacotron2 -from layers import TacotronSTFT, STFT -# from audio_processing import griffin_lim from train import load_model from text import text_to_sequence -# from denoiser import Denoiser import os import soundfile as sf import pyaudio import klepto -import IPython.display as ipd -import time +from librosa import resample +from librosa.effects import time_stretch from sia.file_utils import cached_model_path +from sia.instruments import do_time -sys.path.append('waveglow/') +TTS_SAMPLE_RATE = 22050 +OUTPUT_SAMPLE_RATE = 16000 class TTSModel(object): @@ -33,7 +27,7 @@ class TTSModel(object): def __init__(self): super(TTSModel, self).__init__() hparams = create_hparams() - hparams.sampling_rate = 22050 + hparams.sampling_rate = TTS_SAMPLE_RATE self.model = load_model(hparams) tacotron2_path = cached_model_path("tacotron2_model") self.model.load_state_dict( @@ -53,8 +47,8 @@ class TTSModel(object): if 'Conv' in str(type(m)): setattr(m, 'padding_mode', 'zeros') + @do_time def synth_speech(self, t): - start = time.time() text = t sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :] @@ -62,18 +56,18 @@ class TTSModel(object): mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference( sequence) with torch.no_grad(): - audio = self.waveglow.infer(mel_outputs_postnet, sigma=0.666) - # import ipdb; ipdb.set_trace() - data = convert(audio[0].data.cpu().numpy()) - # _audio_stream.write(data.astype('float32')) - # _audio_stream.write(data) - end = time.time() - print(end - start) + audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666) + audio = audio_t[0].data.cpu().numpy() + # data = convert(audio) + slow_data = time_stretch(audio, 0.8) + float_data = resample(slow_data, TTS_SAMPLE_RATE, OUTPUT_SAMPLE_RATE) + data = float2pcm(float_data) return data.tobytes() def convert(array): - sf.write('sample.wav', array, 22050) + sf.write('sample.wav', array, TTS_SAMPLE_RATE) + # convert to $OUTPUT_SAMPLE_RATE os.system('ffmpeg -i {0} -filter:a "atempo=0.80" -ar 16k {1}'.format( 'sample.wav', 'sample0.wav')) data, rate = sf.read('sample0.wav', dtype='int16') @@ -82,7 +76,45 @@ def convert(array): return data +# https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py +def 
float2pcm(sig, dtype='int16'): + """Convert floating point signal with a range from -1 to 1 to PCM. + Any signal values outside the interval [-1.0, 1.0) are clipped. + No dithering is used. + Note that there are different possibilities for scaling floating + point numbers to PCM numbers, this function implements just one of + them. For an overview of alternatives see + http://blog.bjornroche.com/2009/12/int-float-int-its-jungle-out-there.html + Parameters + ---------- + sig : array_like + Input array, must have floating point type. + dtype : data type, optional + Desired (integer) data type. + Returns + ------- + numpy.ndarray + Integer data, scaled and clipped to the range of the given + *dtype*. + See Also + -------- + pcm2float, dtype + """ + sig = np.asarray(sig) + if sig.dtype.kind != 'f': + raise TypeError("'sig' must be a float array") + dtype = np.dtype(dtype) + if dtype.kind not in 'iu': + raise TypeError("'dtype' must be an integer type") + + i = np.iinfo(dtype) + abs_max = 2**(i.bits - 1) + offset = i.min + abs_max + return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype) + + def display(data): + import IPython.display as ipd aud = ipd.Audio(data, rate=16000) return aud @@ -91,7 +123,7 @@ def player_gen(): audio_interface = pyaudio.PyAudio() _audio_stream = audio_interface.open(format=pyaudio.paInt16, channels=1, - rate=16000, + rate=OUTPUT_SAMPLE_RATE, output=True) def play_device(data): diff --git a/waveglow b/waveglow deleted file mode 160000 index 4b1001f..0000000 --- a/waveglow +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 4b1001fa3336a1184b8293745bb89b177457f09b
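
With the `waveglow` submodule gone, the WaveGlow checkpoint is unpickled against the vendored `glow.py` at the repo root, so the old `sys.path.append('waveglow/')` is no longer needed. A minimal loading sketch, mirroring what `tts.py` does; it assumes `glow.py` is importable from the working directory and uses the `waveglow_256channels.pt` filename from the Setup section (in `tts.py` the path actually comes from `cached_model_path`):

```python
# Sketch: load the WaveGlow checkpoint against the vendored glow.py.
# Assumes glow.py sits on sys.path so torch.load can unpickle glow.WaveGlow.
import torch

waveglow = torch.load('waveglow_256channels.pt', map_location='cpu')['model']
waveglow.eval()
for k in waveglow.convinv:
    k.float()

# Conv1d objects pickled by older PyTorch lack the padding_mode attribute;
# patch it in, as tts.py does (https://github.com/NVIDIA/waveglow/issues/127).
for m in waveglow.modules():
    if 'Conv' in str(type(m)):
        setattr(m, 'padding_mode', 'zeros')
```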
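The core of the change is `synth_speech`'s new post-processing chain: `librosa.effects.time_stretch` replaces `ffmpeg -filter:a "atempo=0.80"`, `librosa.resample` replaces `-ar 16k`, and `float2pcm` replaces the `dtype='int16'` re-read from disk. A self-contained sketch of that chain, using a synthetic tone in place of WaveGlow output and assuming the positional librosa 0.x signatures this patch targets (recent librosa makes `rate`, `orig_sr`, and `target_sr` keyword-only):

```python
import numpy as np
from librosa import resample
from librosa.effects import time_stretch

TTS_SAMPLE_RATE = 22050      # Tacotron 2 / WaveGlow output rate
OUTPUT_SAMPLE_RATE = 16000   # rate expected by the pyaudio player


def float2pcm(sig, dtype='int16'):
    """Clip a [-1.0, 1.0) float signal and scale it to integer PCM."""
    sig = np.asarray(sig)
    i = np.iinfo(np.dtype(dtype))
    abs_max = 2 ** (i.bits - 1)
    offset = i.min + abs_max
    return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)


# One second of a 440 Hz tone stands in for waveglow.infer(...) output.
t = np.linspace(0, 1, TTS_SAMPLE_RATE, endpoint=False)
audio = 0.5 * np.sin(2 * np.pi * 440 * t).astype(np.float32)

slow = time_stretch(audio, 0.8)                            # ffmpeg atempo=0.80
low = resample(slow, TTS_SAMPLE_RATE, OUTPUT_SAMPLE_RATE)  # ffmpeg -ar 16k
pcm = float2pcm(low)                                       # int16 for pyaudio

assert float2pcm(np.array([-1.0]))[0] == -32768            # full-scale check
print(pcm.dtype, len(pcm))                                 # int16, ~20000 samples
```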
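On the client side, `tts_player()` now closes over a single gRPC stub and pyaudio stream, so repeated synthesis is a plain function call. A usage sketch, assuming the repo's TTS gRPC server is already listening on localhost:50060:

```python
# Hypothetical interactive use of the reworked client: tts_player() builds the
# pyaudio output stream and gRPC stub once; play() is then called per utterance.
from demo_client import tts_player

play = tts_player()
play('How may I help you today?')
play('Thank you for calling Huggies.')
```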