diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index 051ba22..0000000
--- a/.gitmodules
+++ /dev/null
@@ -1,4 +0,0 @@
-[submodule "waveglow"]
- path = waveglow
- url = https://github.com/NVIDIA/waveglow
- branch = master
diff --git a/README.md b/README.md
old mode 100755
new mode 100644
index 14cfc78..a953b8f
--- a/README.md
+++ b/README.md
@@ -1,81 +1,23 @@
-# Tacotron 2 (without wavenet)
-
-PyTorch implementation of [Natural TTS Synthesis By Conditioning
-Wavenet On Mel Spectrogram Predictions](https://arxiv.org/pdf/1712.05884.pdf).
-
-This implementation includes **distributed** and **automatic mixed precision** support
-and uses the [LJSpeech dataset](https://keithito.com/LJ-Speech-Dataset/).
-
-Distributed and Automatic Mixed Precision support relies on NVIDIA's [Apex] and [AMP].
-
-Visit our [website] for audio samples using our published [Tacotron 2] and
-[WaveGlow] models.
-
-
-
-
-## Pre-requisites
-1. NVIDIA GPU + CUDA cuDNN
## Setup
-1. Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/)
-2. Clone this repo: `git clone https://github.com/NVIDIA/tacotron2.git`
-3. CD into this repo: `cd tacotron2`
-4. Initialize submodule: `git submodule init; git submodule update`
-5. Update .wav paths: `sed -i -- 's,DUMMY,ljs_dataset_folder/wavs,g' filelists/*.txt`
- - Alternatively, set `load_mel_from_disk=True` in `hparams.py` and update mel-spectrogram paths
-6. Install [PyTorch 1.0]
-7. Install [Apex]
-8. Install python requirements or build docker image
- - Install python requirements: `pip install -r requirements.txt`
+- clone the repo
-## Training
-1. `python train.py --output_directory=outdir --log_directory=logdir`
-2. (OPTIONAL) `tensorboard --logdir=outdir/logdir`
+`git clone https://github.com/agaralabs/tacotron2`
+- cd into `tacotron2` and copy the models from wolverine:
-## Training using a pre-trained model
-Training using a pre-trained model can lead to faster convergence
-By default, the dataset dependent text embedding layers are [ignored]
+`scp wolverine:/home/ubuntu/tacotron2/{checkpoint_15000,waveglow_256channels.pt} ./`
-1. Download our published [Tacotron 2] model
-2. `python train.py --output_directory=outdir --log_directory=logdir -c tacotron2_statedict.pt --warm_start`
+`scp wolverine:/home/ubuntu/tacotron2/waveglow ./`
-## Multi-GPU (distributed) and Automatic Mixed Precision Training
-1. `python -m multiproc train.py --output_directory=outdir --log_directory=logdir --hparams=distributed_run=True,fp16_run=True`
+**Wolverine Details:**
+```
+Host wolverine
+ Hostname 54.71.137.17
+ User ubuntu
+ IdentityFile ~/.ssh/id_hip_ml
+```
+- install the dependencies:
+`pip install -r requirements.txt`
-## Inference demo
-1. Download our published [Tacotron 2] model
-2. Download our published [WaveGlow] model
-3. `jupyter notebook --ip=127.0.0.1 --port=31337`
-4. Load inference.ipynb
-
-N.b. When performing Mel-Spectrogram to Audio synthesis, make sure Tacotron 2
-and the Mel decoder were trained on the same mel-spectrogram representation.
-
-
-## Related repos
-[WaveGlow](https://github.com/NVIDIA/WaveGlow) Faster than real time Flow-based
-Generative Network for Speech Synthesis
-
-[nv-wavenet](https://github.com/NVIDIA/nv-wavenet/) Faster than real time
-WaveNet.
-
-## Acknowledgements
-This implementation uses code from the following repos: [Keith
-Ito](https://github.com/keithito/tacotron/), [Prem
-Seetharaman](https://github.com/pseeth/pytorch-stft) as described in our code.
-
-We are inspired by [Ryuchi Yamamoto's](https://github.com/r9y9/tacotron_pytorch)
-Tacotron PyTorch implementation.
-
-We are thankful to the Tacotron 2 paper authors, specially Jonathan Shen, Yuxuan
-Wang and Zongheng Yang.
-
-
-[WaveGlow]: https://drive.google.com/file/d/1WsibBTsuRg_SF2Z6L6NFRTT-NjEy1oTx/view?usp=sharing
-[Tacotron 2]: https://drive.google.com/file/d/1c5ZTuT7J08wLUoVZ2KkUs_VdZuJ86ZqA/view?usp=sharing
-[pytorch 1.0]: https://github.com/pytorch/pytorch#installation
-[website]: https://nv-adlr.github.io/WaveGlow
-[ignored]: https://github.com/NVIDIA/tacotron2/blob/master/hparams.py#L22
-[Apex]: https://github.com/nvidia/apex
-[AMP]: https://github.com/NVIDIA/apex/tree/master/apex/amp
+## Running
+`python final.py`
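
For a quick end-to-end check without the gRPC layer, here is a minimal sketch of driving the pipeline directly from the updated `tts.py` further down in this diff (it assumes the checkpoints referenced above are already cached via `sia.file_utils` and the Python dependencies are installed):

```python
import numpy as np
from tts import TTSModel, player_gen

# Load Tacotron 2 + WaveGlow once, then reuse the model for every utterance.
tts = TTSModel()
play = player_gen()

pcm = tts.synth_speech('How may I help you today?')   # 16 kHz int16 PCM bytes
play(np.frombuffer(pcm, dtype=np.int16))              # play on the default audio device
```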
diff --git a/WORKFLOW.md b/WORKFLOW.md
deleted file mode 100644
index a953b8f..0000000
--- a/WORKFLOW.md
+++ /dev/null
@@ -1,23 +0,0 @@
-
-## Setup
-- clone the repo
-
-`git clone https://github.com/agaralabs/tacotron2`
-- cd to `tacotron2` copy models from wolverine:
-
-`scp wolverine:/home/ubuntu/tacotron2/{checkpoint_15000,waveglow_256channels.pt} ./`
-
-`scp wolverine:/home/ubuntu/tacotron2/waveglow ./`
-
-**Wolverine Details:**
-```
-Host wolverine
- Hostname 54.71.137.17
- User ubuntu
- IdentityFile ~/.ssh/id_hip_ml
-```
-install the dependencies
-`pip install requirements.txt`
-
-## Running:
-`python final.py`
diff --git a/demo_client.py b/demo_client.py
index bc2a12e..b00a53a 100644
--- a/demo_client.py
+++ b/demo_client.py
@@ -4,15 +4,23 @@ from sia.proto import tts_pb2_grpc
from tts import player_gen
-def main():
+def tts_player():
+ player = player_gen()
channel = grpc.insecure_channel('localhost:50060')
stub = tts_pb2_grpc.ServerStub(channel)
- test_text = tts_pb2.TextInput(text='How may I help you today?')
- speech = stub.TextToSpeechAPI(test_text)
- player = player_gen()
- player(speech.response)
- import pdb
- pdb.set_trace()
+
+ def play(t):
+ test_text = tts_pb2.TextInput(text=t)
+ speech = stub.TextToSpeechAPI(test_text)
+ player(speech.response)
+ return play
+
+
+def main():
+ play = tts_player()
+ play('How may I help you today?')
+ import ipdb
+ ipdb.set_trace()
if __name__ == '__main__':
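
A hypothetical use of the refactored client: `tts_player()` sets up the gRPC channel, stub, and audio player once, and the returned closure can then be called per utterance (this assumes the TTS server from this repo is listening on `localhost:50060`):

```python
from demo_client import tts_player

play = tts_player()   # one-time setup: channel, stub, PyAudio stream

# Each call sends text to the server and plays back the synthesized audio.
for line in ('Hello.', 'How may I help you today?'):
    play(line)
```

Building the channel and player outside the closure avoids reconnecting and reopening the audio stream on every request.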
diff --git a/final.ipynb b/final.ipynb
deleted file mode 100644
index 29926f3..0000000
--- a/final.ipynb
+++ /dev/null
@@ -1,232 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import matplotlib\n",
- "#%matplotlib inline\n",
- "import matplotlib.pylab as plt\n",
- "\n",
- "import IPython.display as ipd\n",
- "\n",
- "import sys\n",
- "sys.path.append('waveglow/')\n",
- "import numpy as np\n",
- "import torch\n",
- "\n",
- "from hparams import create_hparams\n",
- "from model import Tacotron2\n",
- "from layers import TacotronSTFT, STFT\n",
- "from audio_processing import griffin_lim\n",
- "from train import load_model\n",
- "from text import text_to_sequence\n",
- "from denoiser import Denoiser"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import os"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "hparams = create_hparams()\n",
- "hparams.sampling_rate = 22050\n",
- "checkpoint_path = \"checkpoint_15000\"\n",
- "model = load_model(hparams)\n",
- "model.load_state_dict(torch.load(checkpoint_path, map_location = 'cpu')['state_dict']) #added map_location = 'cpu'\n",
- "_ = model.eval() #it was originally model.cuda().eval().half()\n",
- "waveglow_path = 'waveglow_256channels.pt'\n",
- "waveglow = torch.load(waveglow_path, map_location = 'cpu')['model'] #added map_location = 'cpu'\n",
- "waveglow.eval() #originally waveglow.cuda().eval().half()\n",
- "for k in waveglow.convinv:\n",
- " k.float()\n",
- "#denoiser = Denoiser(waveglow)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import soundfile as sf"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import time"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def convert(array):\n",
- " sf.write('sample.wav', array, 22050)\n",
- " os.system('ffmpeg -i {0} -filter:a \"atempo=0.80\" {1}'.format('sample.wav', 'sample0.wav'))\n",
- " #os.system('ffmpeg -i {0} -ar 8000 {1}'.format('sample0.wav', 'sample1.wav'))\n",
- " data, rate = sf.read('sample0.wav')\n",
- " os.remove('sample.wav')\n",
- " os.remove('sample0.wav')\n",
- " #os.remove('sample1.wav')\n",
- " return data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def speech(t):\n",
- " start = time.time()\n",
- " text = t\n",
- " sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]\n",
- " print(sequence)\n",
- " sequence = torch.autograd.Variable(\n",
- " torch.from_numpy(sequence)).long() #originally torch.from_numpy(sequence)).cuda().long()\n",
- " mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)\n",
- " with torch.no_grad():\n",
- " audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)\n",
- " #audio_denoised = denoiser(audio, strength=0.01)[:, 0]\n",
- " data = convert(audio[0].data.cpu().numpy())\n",
- " #os.system('ffmpeg -i {0} -filter:a \"atempo=0.85\" {1}'.format('harvard_inference/audio/'+str(i)+'.wav', 'harvard_inference/audio_0.85/'+str(i)+'.wav'))\n",
- " aud = ipd.Audio(data, rate=22050)\n",
- " end = time.time()\n",
- " print(end-start)\n",
- " return aud"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "speech('I understand your frustration and disappointment. I am sorry that its happening and I would like to help prevent it in the future. What style of diapers did you buy? For instance, was it the snugglers, pull ups or baby dry.')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [],
- "source": [
- "from final import display,speech,play_device\n",
- "import pyaudio"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "15.046638011932373\n"
- ]
- }
- ],
- "source": [
- "data = speech('Thank you for calling Huggies. How may I help you today .')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "display(data)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [],
- "source": [
- "def play_device(data):\n",
- " audio_interface = pyaudio.PyAudio()\n",
- " _audio_stream = audio_interface.open(format=pyaudio.paInt16,channels=1, rate=16000,output=True)\n",
- " _audio_stream.write(data.tostring())\n",
- "# _audio_stream.close()\n",
- "play_device(data)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.3"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/final.py b/final.py
deleted file mode 100644
index 0b826b2..0000000
--- a/final.py
+++ /dev/null
@@ -1,126 +0,0 @@
-#!/usr/bin/env python
-# coding: utf-8
-
-# import matplotlib
-# import matplotlib.pylab as plt
-
-# import IPython.display as ipd
-
-import sys
-import numpy as np
-import torch
-from hparams import create_hparams
-from model import Tacotron2
-from layers import TacotronSTFT, STFT
-# from audio_processing import griffin_lim
-from train import load_model
-from text import text_to_sequence
-# from denoiser import Denoiser
-import os
-import soundfile as sf
-import pyaudio
-import klepto
-import IPython.display as ipd
-import time
-from sia.file_utils import cached_model_path
-
-sys.path.append('waveglow/')
-hparams = create_hparams()
-hparams.sampling_rate = 22050
-model = load_model(hparams)
-tacotron2_path = cached_model_path("tacotron2_model")
-model.load_state_dict(
- torch.load(tacotron2_path, map_location='cpu')['state_dict'])
-model.eval()
-waveglow_path = cached_model_path('waveglow_model')
-waveglow = torch.load(waveglow_path, map_location='cpu')['model']
-waveglow.eval()
-for k in waveglow.convinv:
- k.float()
-k_cache = klepto.archives.file_archive(cached=False)
-
-# https://github.com/NVIDIA/waveglow/issues/127
-for m in waveglow.modules():
- if 'Conv' in str(type(m)):
- setattr(m, 'padding_mode', 'zeros')
-
-
-def convert(array):
- sf.write('sample.wav', array, 22050)
- os.system('ffmpeg -i {0} -filter:a "atempo=0.80" -ar 16k {1}'.format(
- 'sample.wav', 'sample0.wav'))
- data, rate = sf.read('sample0.wav', dtype='int16')
- os.remove('sample.wav')
- os.remove('sample0.wav')
- return data
-
-
-@klepto.safe.inf_cache(cache=k_cache)
-def speech(t):
- start = time.time()
- text = t
- sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
- sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long()
- mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
- with torch.no_grad():
- audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
- # import ipdb; ipdb.set_trace()
- data = convert(audio[0].data.cpu().numpy())
- # _audio_stream.write(data.astype('float32'))
- # _audio_stream.write(data)
- end = time.time()
- print(end - start)
- return data
-
-
-def display(data):
- aud = ipd.Audio(data, rate=16000)
- return aud
-
-
-def player_gen():
- audio_interface = pyaudio.PyAudio()
- _audio_stream = audio_interface.open(format=pyaudio.paInt16,
- channels=1,
- rate=16000,
- output=True)
-
- def play_device(data):
- _audio_stream.write(data.tostring())
- # _audio_stream.close()
-
- return play_device
-
-
-def synthesize_corpus():
- all_data = []
- for (i, line) in enumerate(open('corpus.txt').readlines()):
- print('synthesizing... "{}"'.format(line.strip()))
- data = speech(line.strip())
- sf.write('tts_{}.wav'.format(i), data, 16000)
- all_data.append(data)
- return all_data
-
-
-def play_corpus(corpus_synths):
- player = player_gen()
- for d in corpus_synths:
- player(d)
-
-
-def main():
- # data = speech('Hi I am Sia. How may I help you today .'.lower())
- # audio_interface = pyaudio.PyAudio()
- # _audio_stream = audio_interface.open(format=pyaudio.paInt16,
- # channels=1,
- # rate=16000,
- # output=True)
- # _audio_stream.write(data)
- corpus_synth_data = synthesize_corpus()
- play_corpus(corpus_synth_data)
- import ipdb
- ipdb.set_trace()
-
-
-if __name__ == '__main__':
- main()
diff --git a/glow.py b/glow.py
new file mode 100644
index 0000000..e5ce84a
--- /dev/null
+++ b/glow.py
@@ -0,0 +1,311 @@
+# *****************************************************************************
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of the NVIDIA CORPORATION nor the
+# names of its contributors may be used to endorse or promote products
+# derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# *****************************************************************************
+import copy
+import torch
+from torch.autograd import Variable
+import torch.nn.functional as F
+
+
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+ n_channels_int = n_channels[0]
+ in_act = input_a+input_b
+ t_act = torch.nn.functional.tanh(in_act[:, :n_channels_int, :])
+ s_act = torch.nn.functional.sigmoid(in_act[:, n_channels_int:, :])
+ acts = t_act * s_act
+ return acts
+
+
+class WaveGlowLoss(torch.nn.Module):
+ def __init__(self, sigma=1.0):
+ super(WaveGlowLoss, self).__init__()
+ self.sigma = sigma
+
+ def forward(self, model_output):
+ z, log_s_list, log_det_W_list = model_output
+ for i, log_s in enumerate(log_s_list):
+ if i == 0:
+ log_s_total = torch.sum(log_s)
+ log_det_W_total = log_det_W_list[i]
+ else:
+ log_s_total = log_s_total + torch.sum(log_s)
+ log_det_W_total += log_det_W_list[i]
+
+ loss = torch.sum(z*z)/(2*self.sigma*self.sigma) - log_s_total - log_det_W_total
+ return loss/(z.size(0)*z.size(1)*z.size(2))
+
+
+class Invertible1x1Conv(torch.nn.Module):
+ """
+ The layer outputs both the convolution, and the log determinant
+ of its weight matrix. If reverse=True it does convolution with
+ inverse
+ """
+ def __init__(self, c):
+ super(Invertible1x1Conv, self).__init__()
+ self.conv = torch.nn.Conv1d(c, c, kernel_size=1, stride=1, padding=0,
+ bias=False)
+
+ # Sample a random orthonormal matrix to initialize weights
+ W = torch.qr(torch.FloatTensor(c, c).normal_())[0]
+
+ # Ensure determinant is 1.0 not -1.0
+ if torch.det(W) < 0:
+ W[:,0] = -1*W[:,0]
+ W = W.view(c, c, 1)
+ self.conv.weight.data = W
+
+ def forward(self, z, reverse=False):
+ # shape
+ batch_size, group_size, n_of_groups = z.size()
+
+ W = self.conv.weight.squeeze()
+
+ if reverse:
+ if not hasattr(self, 'W_inverse'):
+ # Reverse computation
+ W_inverse = W.inverse()
+ W_inverse = Variable(W_inverse[..., None])
+ if z.type() == 'torch.cuda.HalfTensor':
+ W_inverse = W_inverse.half()
+ self.W_inverse = W_inverse
+ z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0)
+ return z
+ else:
+ # Forward computation
+ log_det_W = batch_size * n_of_groups * torch.logdet(W)
+ z = self.conv(z)
+ return z, log_det_W
+
+
+class WN(torch.nn.Module):
+ """
+ This is the WaveNet like layer for the affine coupling. The primary difference
+ from WaveNet is the convolutions need not be causal. There is also no dilation
+ size reset. The dilation only doubles on each layer
+ """
+ def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels,
+ kernel_size):
+ super(WN, self).__init__()
+ assert(kernel_size % 2 == 1)
+ assert(n_channels % 2 == 0)
+ self.n_layers = n_layers
+ self.n_channels = n_channels
+ self.in_layers = torch.nn.ModuleList()
+ self.res_skip_layers = torch.nn.ModuleList()
+ self.cond_layers = torch.nn.ModuleList()
+
+ start = torch.nn.Conv1d(n_in_channels, n_channels, 1)
+ start = torch.nn.utils.weight_norm(start, name='weight')
+ self.start = start
+
+ # Initializing last layer to 0 makes the affine coupling layers
+ # do nothing at first. This helps with training stability
+ end = torch.nn.Conv1d(n_channels, 2*n_in_channels, 1)
+ end.weight.data.zero_()
+ end.bias.data.zero_()
+ self.end = end
+
+ for i in range(n_layers):
+ dilation = 2 ** i
+ padding = int((kernel_size*dilation - dilation)/2)
+ in_layer = torch.nn.Conv1d(n_channels, 2*n_channels, kernel_size,
+ dilation=dilation, padding=padding)
+ in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
+ self.in_layers.append(in_layer)
+
+ cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels, 1)
+ cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
+ self.cond_layers.append(cond_layer)
+
+ # last one is not necessary
+ if i < n_layers - 1:
+ res_skip_channels = 2*n_channels
+ else:
+ res_skip_channels = n_channels
+ res_skip_layer = torch.nn.Conv1d(n_channels, res_skip_channels, 1)
+ res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
+ self.res_skip_layers.append(res_skip_layer)
+
+ def forward(self, forward_input):
+ audio, spect = forward_input
+ audio = self.start(audio)
+ for i in range(self.n_layers):
+ acts = fused_add_tanh_sigmoid_multiply(
+ self.in_layers[i](audio),
+ self.cond_layers[i](spect),
+ torch.IntTensor([self.n_channels]))
+
+ res_skip_acts = self.res_skip_layers[i](acts)
+ if i < self.n_layers - 1:
+ audio = res_skip_acts[:,:self.n_channels,:] + audio
+ skip_acts = res_skip_acts[:,self.n_channels:,:]
+ else:
+ skip_acts = res_skip_acts
+
+ if i == 0:
+ output = skip_acts
+ else:
+ output = skip_acts + output
+ return self.end(output)
+
+
+class WaveGlow(torch.nn.Module):
+ def __init__(self, n_mel_channels, n_flows, n_group, n_early_every,
+ n_early_size, WN_config):
+ super(WaveGlow, self).__init__()
+
+ self.upsample = torch.nn.ConvTranspose1d(n_mel_channels,
+ n_mel_channels,
+ 1024, stride=256)
+ assert(n_group % 2 == 0)
+ self.n_flows = n_flows
+ self.n_group = n_group
+ self.n_early_every = n_early_every
+ self.n_early_size = n_early_size
+ self.WN = torch.nn.ModuleList()
+ self.convinv = torch.nn.ModuleList()
+
+ n_half = int(n_group/2)
+
+ # Set up layers with the right sizes based on how many dimensions
+ # have been output already
+ n_remaining_channels = n_group
+ for k in range(n_flows):
+ if k % self.n_early_every == 0 and k > 0:
+ n_half = n_half - int(self.n_early_size/2)
+ n_remaining_channels = n_remaining_channels - self.n_early_size
+ self.convinv.append(Invertible1x1Conv(n_remaining_channels))
+ self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config))
+ self.n_remaining_channels = n_remaining_channels # Useful during inference
+
+ def forward(self, forward_input):
+ """
+ forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames
+ forward_input[1] = audio: batch x time
+ """
+ spect, audio = forward_input
+
+ # Upsample spectrogram to size of audio
+ spect = self.upsample(spect)
+ assert(spect.size(2) >= audio.size(1))
+ if spect.size(2) > audio.size(1):
+ spect = spect[:, :, :audio.size(1)]
+
+ spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
+ spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)
+
+ audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1)
+ output_audio = []
+ log_s_list = []
+ log_det_W_list = []
+
+ for k in range(self.n_flows):
+ if k % self.n_early_every == 0 and k > 0:
+ output_audio.append(audio[:,:self.n_early_size,:])
+ audio = audio[:,self.n_early_size:,:]
+
+ audio, log_det_W = self.convinv[k](audio)
+ log_det_W_list.append(log_det_W)
+
+ n_half = int(audio.size(1)/2)
+ audio_0 = audio[:,:n_half,:]
+ audio_1 = audio[:,n_half:,:]
+
+ output = self.WN[k]((audio_0, spect))
+ log_s = output[:, n_half:, :]
+ b = output[:, :n_half, :]
+ audio_1 = torch.exp(log_s)*audio_1 + b
+ log_s_list.append(log_s)
+
+ audio = torch.cat([audio_0, audio_1],1)
+
+ output_audio.append(audio)
+ return torch.cat(output_audio,1), log_s_list, log_det_W_list
+
+ def infer(self, spect, sigma=1.0):
+ spect = self.upsample(spect)
+ # trim conv artifacts. maybe pad spec to kernel multiple
+ time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0]
+ spect = spect[:, :, :-time_cutoff]
+
+ spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
+ spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)
+
+ if spect.type() == 'torch.cuda.HalfTensor':
+ audio = torch.cuda.HalfTensor(spect.size(0),
+ self.n_remaining_channels,
+ spect.size(2)).normal_()
+ else:
+ # cuda.FloatTensor -> FloatTensor
+ audio = torch.FloatTensor(spect.size(0),
+ self.n_remaining_channels,
+ spect.size(2)).normal_()
+
+ audio = torch.autograd.Variable(sigma*audio)
+
+ for k in reversed(range(self.n_flows)):
+ n_half = int(audio.size(1)/2)
+ audio_0 = audio[:,:n_half,:]
+ audio_1 = audio[:,n_half:,:]
+
+ output = self.WN[k]((audio_0, spect))
+ s = output[:, n_half:, :]
+ b = output[:, :n_half, :]
+ audio_1 = (audio_1 - b)/torch.exp(s)
+ audio = torch.cat([audio_0, audio_1],1)
+
+ audio = self.convinv[k](audio, reverse=True)
+
+ if k % self.n_early_every == 0 and k > 0:
+ if spect.type() == 'torch.cuda.HalfTensor':
+ z = torch.cuda.HalfTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_()
+ else:
+ # cuda.FloatTensor -> FloatTensor
+ z = torch.FloatTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_()
+ audio = torch.cat((sigma*z, audio),1)
+
+ audio = audio.permute(0,2,1).contiguous().view(audio.size(0), -1).data
+ return audio
+
+ @staticmethod
+ def remove_weightnorm(model):
+ waveglow = model
+ for WN in waveglow.WN:
+ WN.start = torch.nn.utils.remove_weight_norm(WN.start)
+ WN.in_layers = remove(WN.in_layers)
+ WN.cond_layers = remove(WN.cond_layers)
+ WN.res_skip_layers = remove(WN.res_skip_layers)
+ return waveglow
+
+
+def remove(conv_list):
+ new_conv_list = torch.nn.ModuleList()
+ for old_conv in conv_list:
+ old_conv = torch.nn.utils.remove_weight_norm(old_conv)
+ new_conv_list.append(old_conv)
+ return new_conv_list
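
Since `glow.py` is now vendored instead of pulled in through the submodule, here is a minimal standalone sketch of exercising it; the hyperparameters are assumed to match the published 256-channel WaveGlow configuration and may need adjusting for a different checkpoint:

```python
import torch
from glow import WaveGlow

# Assumed config (waveglow_256channels-style settings).
wn_config = {'n_layers': 8, 'n_channels': 256, 'kernel_size': 3}
waveglow = WaveGlow(n_mel_channels=80, n_flows=12, n_group=8,
                    n_early_every=4, n_early_size=2, WN_config=wn_config)
waveglow.eval()

with torch.no_grad():
    mel = torch.randn(1, 80, 100)             # batch x n_mel_channels x frames
    audio = waveglow.infer(mel, sigma=0.666)  # batch x samples (256 samples per frame)
print(audio.shape)                            # torch.Size([1, 25600])
```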
diff --git a/loss_scaler.py b/loss_scaler.py
deleted file mode 100644
index 88cc9cf..0000000
--- a/loss_scaler.py
+++ /dev/null
@@ -1,131 +0,0 @@
-import torch
-
-class LossScaler:
-
- def __init__(self, scale=1):
- self.cur_scale = scale
-
- # `params` is a list / generator of torch.Variable
- def has_overflow(self, params):
- return False
-
- # `x` is a torch.Tensor
- def _has_inf_or_nan(x):
- return False
-
- # `overflow` is boolean indicating whether we overflowed in gradient
- def update_scale(self, overflow):
- pass
-
- @property
- def loss_scale(self):
- return self.cur_scale
-
- def scale_gradient(self, module, grad_in, grad_out):
- return tuple(self.loss_scale * g for g in grad_in)
-
- def backward(self, loss):
- scaled_loss = loss*self.loss_scale
- scaled_loss.backward()
-
-class DynamicLossScaler:
-
- def __init__(self,
- init_scale=2**32,
- scale_factor=2.,
- scale_window=1000):
- self.cur_scale = init_scale
- self.cur_iter = 0
- self.last_overflow_iter = -1
- self.scale_factor = scale_factor
- self.scale_window = scale_window
-
- # `params` is a list / generator of torch.Variable
- def has_overflow(self, params):
-# return False
- for p in params:
- if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data):
- return True
-
- return False
-
- # `x` is a torch.Tensor
- def _has_inf_or_nan(x):
- cpu_sum = float(x.float().sum())
- if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
- return True
- return False
-
- # `overflow` is boolean indicating whether we overflowed in gradient
- def update_scale(self, overflow):
- if overflow:
- #self.cur_scale /= self.scale_factor
- self.cur_scale = max(self.cur_scale/self.scale_factor, 1)
- self.last_overflow_iter = self.cur_iter
- else:
- if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
- self.cur_scale *= self.scale_factor
-# self.cur_scale = 1
- self.cur_iter += 1
-
- @property
- def loss_scale(self):
- return self.cur_scale
-
- def scale_gradient(self, module, grad_in, grad_out):
- return tuple(self.loss_scale * g for g in grad_in)
-
- def backward(self, loss):
- scaled_loss = loss*self.loss_scale
- scaled_loss.backward()
-
-##############################################################
-# Example usage below here -- assuming it's in a separate file
-##############################################################
-if __name__ == "__main__":
- import torch
- from torch.autograd import Variable
- from dynamic_loss_scaler import DynamicLossScaler
-
- # N is batch size; D_in is input dimension;
- # H is hidden dimension; D_out is output dimension.
- N, D_in, H, D_out = 64, 1000, 100, 10
-
- # Create random Tensors to hold inputs and outputs, and wrap them in Variables.
- x = Variable(torch.randn(N, D_in), requires_grad=False)
- y = Variable(torch.randn(N, D_out), requires_grad=False)
-
- w1 = Variable(torch.randn(D_in, H), requires_grad=True)
- w2 = Variable(torch.randn(H, D_out), requires_grad=True)
- parameters = [w1, w2]
-
- learning_rate = 1e-6
- optimizer = torch.optim.SGD(parameters, lr=learning_rate)
- loss_scaler = DynamicLossScaler()
-
- for t in range(500):
- y_pred = x.mm(w1).clamp(min=0).mm(w2)
- loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale
- print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale))
- print('Iter {} scaled loss: {}'.format(t, loss.data[0]))
- print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale))
-
- # Run backprop
- optimizer.zero_grad()
- loss.backward()
-
- # Check for overflow
- has_overflow = DynamicLossScaler.has_overflow(parameters)
-
- # If no overflow, unscale grad and update as usual
- if not has_overflow:
- for param in parameters:
- param.grad.data.mul_(1. / loss_scaler.loss_scale)
- optimizer.step()
- # Otherwise, don't do anything -- ie, skip iteration
- else:
- print('OVERFLOW!')
-
- # Update loss scale for next iteration
- loss_scaler.update_scale(has_overflow)
-
diff --git a/multiproc.py b/multiproc.py
deleted file mode 100644
index 060ff93..0000000
--- a/multiproc.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import time
-import torch
-import sys
-import subprocess
-
-argslist = list(sys.argv)[1:]
-num_gpus = torch.cuda.device_count()
-argslist.append('--n_gpus={}'.format(num_gpus))
-workers = []
-job_id = time.strftime("%Y_%m_%d-%H%M%S")
-argslist.append("--group_name=group_{}".format(job_id))
-
-for i in range(num_gpus):
- argslist.append('--rank={}'.format(i))
- stdout = None if i == 0 else open("logs/{}_GPU_{}.log".format(job_id, i),
- "w")
- print(argslist)
- p = subprocess.Popen([str(sys.executable)]+argslist, stdout=stdout)
- workers.append(p)
- argslist = argslist[:-1]
-
-for p in workers:
- p.wait()
diff --git a/tts.py b/tts.py
index a946fa0..30b7cbe 100644
--- a/tts.py
+++ b/tts.py
@@ -1,30 +1,24 @@
#!/usr/bin/env python
# coding: utf-8
-# import matplotlib
-# import matplotlib.pylab as plt
-
-# import IPython.display as ipd
-
import sys
import numpy as np
import torch
from hparams import create_hparams
from model import Tacotron2
-from layers import TacotronSTFT, STFT
-# from audio_processing import griffin_lim
from train import load_model
from text import text_to_sequence
-# from denoiser import Denoiser
import os
import soundfile as sf
import pyaudio
import klepto
-import IPython.display as ipd
-import time
+from librosa import resample
+from librosa.effects import time_stretch
from sia.file_utils import cached_model_path
+from sia.instruments import do_time
-sys.path.append('waveglow/')
+TTS_SAMPLE_RATE = 22050
+OUTPUT_SAMPLE_RATE = 16000
class TTSModel(object):
@@ -33,7 +27,7 @@ class TTSModel(object):
def __init__(self):
super(TTSModel, self).__init__()
hparams = create_hparams()
- hparams.sampling_rate = 22050
+ hparams.sampling_rate = TTS_SAMPLE_RATE
self.model = load_model(hparams)
tacotron2_path = cached_model_path("tacotron2_model")
self.model.load_state_dict(
@@ -53,8 +47,8 @@ class TTSModel(object):
if 'Conv' in str(type(m)):
setattr(m, 'padding_mode', 'zeros')
+ @do_time
def synth_speech(self, t):
- start = time.time()
text = t
sequence = np.array(text_to_sequence(text,
['english_cleaners']))[None, :]
@@ -62,18 +56,18 @@ class TTSModel(object):
mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(
sequence)
with torch.no_grad():
- audio = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)
- # import ipdb; ipdb.set_trace()
- data = convert(audio[0].data.cpu().numpy())
- # _audio_stream.write(data.astype('float32'))
- # _audio_stream.write(data)
- end = time.time()
- print(end - start)
+ audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)
+ audio = audio_t[0].data.cpu().numpy()
+ # data = convert(audio)
+ slow_data = time_stretch(audio, 0.8)
+ float_data = resample(slow_data, TTS_SAMPLE_RATE, OUTPUT_SAMPLE_RATE)
+ data = float2pcm(float_data)
return data.tobytes()
def convert(array):
- sf.write('sample.wav', array, 22050)
+ sf.write('sample.wav', array, TTS_SAMPLE_RATE)
+ # convert to $OUTPUT_SAMPLE_RATE
os.system('ffmpeg -i {0} -filter:a "atempo=0.80" -ar 16k {1}'.format(
'sample.wav', 'sample0.wav'))
data, rate = sf.read('sample0.wav', dtype='int16')
@@ -82,7 +76,45 @@ def convert(array):
return data
+# https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
+def float2pcm(sig, dtype='int16'):
+ """Convert floating point signal with a range from -1 to 1 to PCM.
+ Any signal values outside the interval [-1.0, 1.0) are clipped.
+ No dithering is used.
+ Note that there are different possibilities for scaling floating
+ point numbers to PCM numbers, this function implements just one of
+ them. For an overview of alternatives see
+ http://blog.bjornroche.com/2009/12/int-float-int-its-jungle-out-there.html
+ Parameters
+ ----------
+ sig : array_like
+ Input array, must have floating point type.
+ dtype : data type, optional
+ Desired (integer) data type.
+ Returns
+ -------
+ numpy.ndarray
+ Integer data, scaled and clipped to the range of the given
+ *dtype*.
+ See Also
+ --------
+ pcm2float, dtype
+ """
+ sig = np.asarray(sig)
+ if sig.dtype.kind != 'f':
+ raise TypeError("'sig' must be a float array")
+ dtype = np.dtype(dtype)
+ if dtype.kind not in 'iu':
+ raise TypeError("'dtype' must be an integer type")
+
+ i = np.iinfo(dtype)
+ abs_max = 2**(i.bits - 1)
+ offset = i.min + abs_max
+ return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)
+
+
def display(data):
+ import IPython.display as ipd
aud = ipd.Audio(data, rate=16000)
return aud
@@ -91,7 +123,7 @@ def player_gen():
audio_interface = pyaudio.PyAudio()
_audio_stream = audio_interface.open(format=pyaudio.paInt16,
channels=1,
- rate=16000,
+ rate=OUTPUT_SAMPLE_RATE,
output=True)
def play_device(data):
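
A quick sanity check of the new `float2pcm` helper, with hypothetical input values: using the default `int16` target, it scales a float signal in [-1.0, 1.0) by 2**15 and clips anything outside that range:

```python
import numpy as np
from tts import float2pcm

sig = np.array([-1.0, -0.5, 0.0, 0.5, 1.5], dtype=np.float32)
pcm = float2pcm(sig)   # dtype defaults to 'int16'
print(pcm)             # [-32768 -16384 0 16384 32767]  (1.5 is clipped to the int16 max)
print(pcm.dtype)       # int16
```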
diff --git a/waveglow b/waveglow
deleted file mode 160000
index 4b1001f..0000000
--- a/waveglow
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 4b1001fa3336a1184b8293745bb89b177457f09b