1. clean-up

2. update readme and release info
master
Malar Kannan 2019-10-04 16:15:30 +05:30
parent 36c731cad0
commit edbb1bd57d
4 changed files with 44 additions and 59 deletions

View File

@ -2,7 +2,12 @@
History History
======= =======
0.2.0 (2019-10-04)
------------------
* Add Griffin Lim support
* Allow passing hyper-parameters to TTSModel.
0.1.0 (2019-09-20) 0.1.0 (2019-09-20)
------------------ ------------------
* First release on PyPI. * First release on PyPI.

View File

@ -2,7 +2,7 @@
[![image](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black) [![image](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black)
> Generate speech audio from text > Generates speech audio from text
--- ---
# Table of Contents # Table of Contents
@ -13,22 +13,25 @@
# Features # Features
* Tacotron2 Synthesized Speech * TTS using Tacotron2
# Installation # Installation
Install the packages with for production use. It downloads the dependencies To install the package and its dependencies, run:
```bash ```bash
python setup.py install python setup.py install
``` ```
or with pip
```bash
pip install .
```
> Still facing an issue? Check the [Issues](#issues) section or open a new issue. The installation should work on Python 3.6 or newer. Untested on Python 2.7.
The installation should be smooth with Python 3.6 or newer.
# Usage # Usage
> API
```python ```python
tts_model = TTSModel("/path/to/tacotron2_model","/path/to/waveglow_model") from taco2.tts import TTSModel
SPEECH_AUDIO = tts_model.synth_speech(TEXT) tts_model = TTSModel("/path/to/tacotron2_model","/path/to/waveglow_model") # Loads the models
SPEECH_AUDIO = tts_model.synth_speech(TEXT) # Returns the wav buffer
``` ```
If `'/path/to/waveglow_model'` is `None`, the *Griffin-Lim vocoder* will be used.

View File

@ -29,7 +29,7 @@ packages = find_packages()
setup( setup(
author="Malar Kannan", author="Malar Kannan",
author_email="malar@agaralabs.com", author_email="malarkannan.invention@gmail.com",
classifiers=[ classifiers=[
"Development Status :: 2 - Pre-Alpha", "Development Status :: 2 - Pre-Alpha",
"Intended Audience :: Developers", "Intended Audience :: Developers",
@ -46,14 +46,14 @@ setup(
install_requires=requirements, install_requires=requirements,
long_description=readme + "\n\n" + history, long_description=readme + "\n\n" + history,
include_package_data=True, include_package_data=True,
keywords="tacotron2", keywords="tacotron2 tts",
name="taco2-tts", name="taco2-tts",
packages=packages, packages=packages,
setup_requires=setup_requirements, setup_requires=setup_requirements,
test_suite="tests", test_suite="tests",
tests_require=test_requirements, tests_require=test_requirements,
url="https://github.com/malarinv/tacotron2", url="https://github.com/malarinv/tacotron2",
version="0.1.0", version="0.2.0",
zip_safe=False, zip_safe=False,
entry_points={"console_scripts": ("tts_debug = tts:main",)}, entry_points={"console_scripts": ("tts_debug = taco2.tts:main",)},
) )

View File

@ -46,7 +46,7 @@ class TTSModel(object):
self.waveglow.load_state_dict(wave_params) self.waveglow.load_state_dict(wave_params)
self.waveglow.eval() self.waveglow.eval()
except: except:
self.waveglow = wave_params['model'] self.waveglow = wave_params["model"]
self.waveglow = self.waveglow.remove_weightnorm(self.waveglow) self.waveglow = self.waveglow.remove_weightnorm(self.waveglow)
self.waveglow.eval() self.waveglow.eval()
# workaround from # workaround from
@ -60,7 +60,6 @@ class TTSModel(object):
self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(self.synth_speech) self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(self.synth_speech)
self.denoiser = Denoiser(self.waveglow) self.denoiser = Denoiser(self.waveglow)
def synth_speech(self, text): def synth_speech(self, text):
sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[None, :] sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[None, :]
sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long() sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long()
@ -78,15 +77,23 @@ class TTSModel(object):
data = float2pcm(float_data) data = float2pcm(float_data)
return data.tobytes() return data.tobytes()
def synth_speech_algo(self,text,griffin_iters=60): def synth_speech_algo(self, text, griffin_iters=60):
sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[None, :] sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[None, :]
sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long() sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long()
mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(sequence) mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(sequence)
from .hparams import HParams from .hparams import HParams
from .layers import TacotronSTFT from .layers import TacotronSTFT
from .audio_processing import griffin_lim from .audio_processing import griffin_lim
hparams = HParams() hparams = HParams()
taco_stft = TacotronSTFT(hparams.filter_length, hparams.hop_length, hparams.win_length, n_mel_channels=hparams.n_mel_channels, sampling_rate=hparams.sampling_rate, mel_fmax=4000) taco_stft = TacotronSTFT(
hparams.filter_length,
hparams.hop_length,
hparams.win_length,
n_mel_channels=hparams.n_mel_channels,
sampling_rate=hparams.sampling_rate,
mel_fmax=4000,
)
mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet) mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
mel_decompress = mel_decompress.transpose(1, 2).data.cpu() mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
spec_from_mel_scaling = 1000 spec_from_mel_scaling = 1000
@ -94,7 +101,11 @@ class TTSModel(object):
spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0) spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
spec_from_mel = spec_from_mel * spec_from_mel_scaling spec_from_mel = spec_from_mel * spec_from_mel_scaling
audio = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]), taco_stft.stft_fn, griffin_iters) audio = griffin_lim(
torch.autograd.Variable(spec_from_mel[:, :, :-1]),
taco_stft.stft_fn,
griffin_iters,
)
audio = audio.squeeze() audio = audio.squeeze()
audio = audio.cpu().numpy() audio = audio.cpu().numpy()
@ -102,6 +113,8 @@ class TTSModel(object):
float_data = resample(slow_data, TTS_SAMPLE_RATE, OUTPUT_SAMPLE_RATE) float_data = resample(slow_data, TTS_SAMPLE_RATE, OUTPUT_SAMPLE_RATE)
data = float2pcm(float_data) data = float2pcm(float_data)
return data.tobytes() return data.tobytes()
# adapted from # adapted from
# https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py # https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
def float2pcm(sig, dtype="int16"): def float2pcm(sig, dtype="int16"):
@ -140,13 +153,6 @@ def float2pcm(sig, dtype="int16"):
return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype) return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)
def display(data):
import IPython.display as ipd
aud = ipd.Audio(data, rate=16000)
return aud
def player_gen(): def player_gen():
audio_interface = pyaudio.PyAudio() audio_interface = pyaudio.PyAudio()
_audio_stream = audio_interface.open( _audio_stream = audio_interface.open(
@ -160,51 +166,22 @@ def player_gen():
return play_device return play_device
def synthesize_corpus():
tts_model = TTSModel(
"/Users/malar/Work/tacotron2_statedict.pt",
"/Users/malar/Work/waveglow.pt",
)
all_data = []
for (i, line) in enumerate(open("corpus.txt").readlines()):
print(f'synthesizing... "{line.strip()}"')
data = tts_model.synth_speech(line.strip())
all_data.append(data)
return all_data
def repl(): def repl():
tts_model = TTSModel( tts_model = TTSModel("/path/to/tacotron2.pt", "/path/to/waveglow.pt")
# "/Users/malar/Work/tacotron2_statedict.pt",
# "/Users/malar/Work/tacotron2_80_22000.pt",
"/path/to/tacotron2.pt",
# "/Users/malar/Work/tacotron2_40_22000.pt",
# "/Users/malar/Work/tacotron2_16000.pt",
"/path/to/waveglow.pt",
# "/Users/malar/Work/waveglow.pt",
# "/Users/malar/Work/waveglow_38000",
)
player = player_gen() player = player_gen()
def loop(): def loop():
text = input('tts >') text = input("tts >")
data = tts_model.synth_speech(text.strip()) data = tts_model.synth_speech(text.strip())
player(data) player(data)
return loop return loop
def play_corpus(corpus_synths):
player = player_gen()
for d in corpus_synths:
player(d)
def main(): def main():
# corpus_synth_data = synthesize_corpus()
# play_corpus(corpus_synth_data)
interactive_loop = repl() interactive_loop = repl()
while True: while True:
interactive_loop() interactive_loop()
# import pdb
# pdb.set_trace()
if __name__ == "__main__": if __name__ == "__main__":