diff --git a/HISTORY.rst b/HISTORY.rst index e06aacb..163384e 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -2,7 +2,12 @@ History ======= +0.2.0 (2019-10-04) +------------------ +* Add Griffin Lim support +* Allow passing hyper-parameters to TTSModel. + + 0.1.0 (2019-09-20) ------------------ - * First release on PyPI. diff --git a/README.md b/README.md index f64a96b..918962c 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![image](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black) -> Generate speech audio from text +> Generates speech audio from text --- # Table of Contents @@ -13,22 +13,25 @@ # Features -* Tacotron2 Synthesized Speech +* TTS using Tacotron2 # Installation -Install the packages with for production use. It downloads the dependencies +To install the packages and its dependencies run. ```bash python setup.py install ``` +or with pip +```bash +pip install . +``` -> Still facing an issue? Check the [Issues](#issues) section or open a new issue. - -The installation should be smooth with Python 3.6 or newer. +The installation should work on Python 3.6 or newer. Untested on Python 2.7 # Usage -> API ```python -tts_model = TTSModel("/path/to/tacotron2_model","/path/to/waveglow_model") -SPEECH_AUDIO = tts_model.synth_speech(TEXT) +from taco2.tts import TTSModel +tts_model = TTSModel("/path/to/tacotron2_model","/path/to/waveglow_model") # Loads the models +SPEECH_AUDIO = tts_model.synth_speech(TEXT) # Returns the wav buffer ``` +If `'/path/to/waveglow_model'` is `None` *Griffin-Lim vocoder* will be used. diff --git a/setup.py b/setup.py index f07a6e6..1758a6d 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ packages = find_packages() setup( author="Malar Kannan", - author_email="malar@agaralabs.com", + author_email="malarkannan.invention@gmail.com", classifiers=[ "Development Status :: 2 - Pre-Alpha", "Intended Audience :: Developers", @@ -46,14 +46,14 @@ setup( install_requires=requirements, long_description=readme + "\n\n" + history, include_package_data=True, - keywords="tacotron2", + keywords="tacotron2 tts", name="taco2-tts", packages=packages, setup_requires=setup_requirements, test_suite="tests", tests_require=test_requirements, url="https://github.com/malarinv/tacotron2", - version="0.1.0", + version="0.2.0", zip_safe=False, - entry_points={"console_scripts": ("tts_debug = tts:main",)}, + entry_points={"console_scripts": ("tts_debug = taco2.tts:main",)}, ) diff --git a/taco2/tts.py b/taco2/tts.py index b8ef9c7..d054a90 100644 --- a/taco2/tts.py +++ b/taco2/tts.py @@ -46,7 +46,7 @@ class TTSModel(object): self.waveglow.load_state_dict(wave_params) self.waveglow.eval() except: - self.waveglow = wave_params['model'] + self.waveglow = wave_params["model"] self.waveglow = self.waveglow.remove_weightnorm(self.waveglow) self.waveglow.eval() # workaround from @@ -60,7 +60,6 @@ class TTSModel(object): self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(self.synth_speech) self.denoiser = Denoiser(self.waveglow) - def synth_speech(self, text): sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[None, :] sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long() @@ -78,15 +77,23 @@ class TTSModel(object): data = float2pcm(float_data) return data.tobytes() - def synth_speech_algo(self,text,griffin_iters=60): + def synth_speech_algo(self, text, griffin_iters=60): sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[None, :] sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long() mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(sequence) from .hparams import HParams from .layers import TacotronSTFT from .audio_processing import griffin_lim + hparams = HParams() - taco_stft = TacotronSTFT(hparams.filter_length, hparams.hop_length, hparams.win_length, n_mel_channels=hparams.n_mel_channels, sampling_rate=hparams.sampling_rate, mel_fmax=4000) + taco_stft = TacotronSTFT( + hparams.filter_length, + hparams.hop_length, + hparams.win_length, + n_mel_channels=hparams.n_mel_channels, + sampling_rate=hparams.sampling_rate, + mel_fmax=4000, + ) mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet) mel_decompress = mel_decompress.transpose(1, 2).data.cpu() spec_from_mel_scaling = 1000 @@ -94,7 +101,11 @@ class TTSModel(object): spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0) spec_from_mel = spec_from_mel * spec_from_mel_scaling - audio = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]), taco_stft.stft_fn, griffin_iters) + audio = griffin_lim( + torch.autograd.Variable(spec_from_mel[:, :, :-1]), + taco_stft.stft_fn, + griffin_iters, + ) audio = audio.squeeze() audio = audio.cpu().numpy() @@ -102,6 +113,8 @@ class TTSModel(object): float_data = resample(slow_data, TTS_SAMPLE_RATE, OUTPUT_SAMPLE_RATE) data = float2pcm(float_data) return data.tobytes() + + # adapted from # https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py def float2pcm(sig, dtype="int16"): @@ -140,13 +153,6 @@ def float2pcm(sig, dtype="int16"): return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype) -def display(data): - import IPython.display as ipd - - aud = ipd.Audio(data, rate=16000) - return aud - - def player_gen(): audio_interface = pyaudio.PyAudio() _audio_stream = audio_interface.open( @@ -160,51 +166,22 @@ def player_gen(): return play_device -def synthesize_corpus(): - tts_model = TTSModel( - "/Users/malar/Work/tacotron2_statedict.pt", - "/Users/malar/Work/waveglow.pt", - ) - all_data = [] - for (i, line) in enumerate(open("corpus.txt").readlines()): - print(f'synthesizing... "{line.strip()}"') - data = tts_model.synth_speech(line.strip()) - all_data.append(data) - return all_data - def repl(): - tts_model = TTSModel( - # "/Users/malar/Work/tacotron2_statedict.pt", - # "/Users/malar/Work/tacotron2_80_22000.pt", - "/path/to/tacotron2.pt", - # "/Users/malar/Work/tacotron2_40_22000.pt", - # "/Users/malar/Work/tacotron2_16000.pt", - "/path/to/waveglow.pt", - # "/Users/malar/Work/waveglow.pt", - # "/Users/malar/Work/waveglow_38000", - ) + tts_model = TTSModel("/path/to/tacotron2.pt", "/path/to/waveglow.pt") player = player_gen() + def loop(): - text = input('tts >') + text = input("tts >") data = tts_model.synth_speech(text.strip()) player(data) + return loop -def play_corpus(corpus_synths): - player = player_gen() - for d in corpus_synths: - player(d) - - def main(): - # corpus_synth_data = synthesize_corpus() - # play_corpus(corpus_synth_data) interactive_loop = repl() while True: interactive_loop() - # import pdb - # pdb.set_trace() if __name__ == "__main__":