From a10a6d517e4e59fb389cf2c9b61e82cd141deafe Mon Sep 17 00:00:00 2001 From: Malar Kannan Date: Sat, 21 Sep 2019 01:19:30 +0530 Subject: [PATCH] packaged taco2 --- AUTHORS.rst | 8 ++ HISTORY.rst | 8 ++ MANIFEST.in | 10 +++ README.md | 34 +++++++ corpus.txt | 4 + hparams.py | 88 ------------------- requirements.txt | 2 + server.py | 38 -------- setup.cfg | 28 ++++++ setup.py | 59 +++++++++++++ __init__.py => taco2/__init__.py | 0 .../audio_processing.py | 0 data_utils.py => taco2/data_utils.py | 0 taco2/hparams.py | 76 ++++++++++++++++ layers.py => taco2/layers.py | 0 loss_function.py => taco2/loss_function.py | 0 model.py => taco2/model.py | 0 stft.py => taco2/stft.py | 0 {text => taco2/text}/LICENSE | 0 {text => taco2/text}/__init__.py | 0 {text => taco2/text}/cleaners.py | 0 {text => taco2/text}/cmudict.py | 0 {text => taco2/text}/numbers.py | 0 {text => taco2/text}/symbols.py | 0 tts.py => taco2/tts.py | 71 +++++++-------- utils.py => taco2/utils.py | 0 26 files changed, 265 insertions(+), 161 deletions(-) create mode 100644 AUTHORS.rst create mode 100644 HISTORY.rst create mode 100644 MANIFEST.in create mode 100644 README.md create mode 100644 corpus.txt delete mode 100644 hparams.py create mode 100644 requirements.txt delete mode 100644 server.py create mode 100644 setup.cfg create mode 100644 setup.py rename __init__.py => taco2/__init__.py (100%) rename audio_processing.py => taco2/audio_processing.py (100%) rename data_utils.py => taco2/data_utils.py (100%) create mode 100644 taco2/hparams.py rename layers.py => taco2/layers.py (100%) rename loss_function.py => taco2/loss_function.py (100%) rename model.py => taco2/model.py (100%) rename stft.py => taco2/stft.py (100%) rename {text => taco2/text}/LICENSE (100%) rename {text => taco2/text}/__init__.py (100%) rename {text => taco2/text}/cleaners.py (100%) rename {text => taco2/text}/cmudict.py (100%) rename {text => taco2/text}/numbers.py (100%) rename {text => taco2/text}/symbols.py (100%) rename tts.py => 
taco2/tts.py (80%) rename utils.py => taco2/utils.py (100%) diff --git a/AUTHORS.rst b/AUTHORS.rst new file mode 100644 index 0000000..f7cb4d4 --- /dev/null +++ b/AUTHORS.rst @@ -0,0 +1,8 @@ +======= +Credits +======= + +Contributors +------------ + +* Malar Kannan diff --git a/HISTORY.rst b/HISTORY.rst new file mode 100644 index 0000000..e06aacb --- /dev/null +++ b/HISTORY.rst @@ -0,0 +1,8 @@ +======= +History +======= + +0.1.0 (2019-09-20) +------------------ + +* First release on PyPI. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..36ea105 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,10 @@ +include AUTHORS.rst +include HISTORY.rst +include LICENSE +include README.md + +recursive-include tests * +recursive-exclude * __pycache__ +recursive-exclude * *.py[co] + +recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif diff --git a/README.md b/README.md new file mode 100644 index 0000000..f64a96b --- /dev/null +++ b/README.md @@ -0,0 +1,34 @@ +# Taco2 TTS + +[![image](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black) + +> Generate speech audio from text +--- + +# Table of Contents + +* [Features](#features) +* [Installation](#installation) +* [Usage](#usage) + +# Features + +* Tacotron2 Synthesized Speech + + +# Installation +Install the package for production use. This also downloads its dependencies. +```bash +python setup.py install +``` + +> Still facing an issue? Please open a new issue on the project repository. + +The installation should be smooth with Python 3.7 or newer (the package uses the standard-library `dataclasses` module). + +# Usage +> API +```python +tts_model = TTSModel("/path/to/tacotron2_model","/path/to/waveglow_model") +SPEECH_AUDIO = tts_model.synth_speech(TEXT) +``` diff --git a/corpus.txt b/corpus.txt new file mode 100644 index 0000000..64314d7 --- /dev/null +++ b/corpus.txt @@ -0,0 +1,4 @@ +Hello world! +How have you been? +Today is a good day.
+This seems to be working good diff --git a/hparams.py b/hparams.py deleted file mode 100644 index 58cf525..0000000 --- a/hparams.py +++ /dev/null @@ -1,88 +0,0 @@ -# -*- coding: utf-8 -*- -import tensorflow as tf -from .text import symbols - - -# changed path, sampling rate and batch size -def create_hparams(hparams_string=None, verbose=False): - """Create model hyperparameters. Parse nondefault from given string.""" - - hparams = tf.contrib.training.HParams( - ################################ - # Experiment Parameters # - ################################ - epochs=500, - iters_per_checkpoint=1000, - seed=1234, - dynamic_loss_scaling=True, - fp16_run=False, - distributed_run=False, - dist_backend="nccl", - dist_url="tcp://localhost:54321", - cudnn_enabled=True, - cudnn_benchmark=False, - ignore_layers=["embedding.weight"], - ################################ - # Data Parameters # - ################################ - load_mel_from_disk=False, - training_files="lists/tts_data_train_processed.txt", - validation_files="filelists/tts_data_val_processed.txt", - text_cleaners=["english_cleaners"], - ################################ - # Audio Parameters # - ################################ - max_wav_value=32768.0, - sampling_rate=16000, - filter_length=1024, - hop_length=256, - win_length=1024, - n_mel_channels=80, - mel_fmin=0.0, - mel_fmax=8000.0, - ################################ - # Model Parameters # - ################################ - n_symbols=len(symbols), - symbols_embedding_dim=512, - # Encoder parameters - encoder_kernel_size=5, - encoder_n_convolutions=3, - encoder_embedding_dim=512, - # Decoder parameters - n_frames_per_step=1, # currently only 1 is supported - decoder_rnn_dim=1024, - prenet_dim=256, - max_decoder_steps=1000, - gate_threshold=0.5, - p_attention_dropout=0.1, - p_decoder_dropout=0.1, - # Attention parameters - attention_rnn_dim=1024, - attention_dim=128, - # Location Layer parameters - attention_location_n_filters=32, - 
attention_location_kernel_size=31, - # Mel-post processing network parameters - postnet_embedding_dim=512, - postnet_kernel_size=5, - postnet_n_convolutions=5, - ################################ - # Optimization Hyperparameters # - ################################ - use_saved_learning_rate=False, - learning_rate=1e-3, - weight_decay=1e-6, - grad_clip_thresh=1.0, - batch_size=4, - mask_padding=True, # set model's padded outputs to padded values - ) - - if hparams_string: - tf.logging.info("Parsing command line hparams: %s", hparams_string) - hparams.parse(hparams_string) - - if verbose: - tf.logging.info("Final parsed hparams: %s", hparams.values()) - - return hparams diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..ba4d322 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +numpy==1.17.2 +torch==1.2.0 diff --git a/server.py b/server.py deleted file mode 100644 index d41119a..0000000 --- a/server.py +++ /dev/null @@ -1,38 +0,0 @@ -# -*- coding: utf-8 -*- -import grpc -import time -from sia.proto import tts_pb2 -from sia.proto import tts_pb2_grpc -from concurrent import futures -from .tts import TTSModel - - -class TTSServer: - def __init__(self): - self.tts_model = TTSModel() - - def TextToSpeechAPI(self, request, context): - while True: - input_text = request.text - speech_response = self.tts_model.synth_speech(input_text) - return tts_pb2.SpeechResponse(response=speech_response) - - -def main(): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) - tts_server = TTSServer() - tts_pb2_grpc.add_ServerServicer_to_server(tts_server, server) - server.add_insecure_port("localhost:50060") - server.start() - print("TTSServer started!") - - try: - while True: - time.sleep(10000) - except KeyboardInterrupt: - server.start() - # server.stop(0) - - -if __name__ == "__main__": - main() diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..82236da --- /dev/null +++ b/setup.cfg @@ -0,0 +1,28 @@ +[bumpversion] 
+current_version = 0.1.0
+commit = True
+tag = True
+
+[bumpversion:file:setup.py]
+search = version="{current_version}"
+replace = version="{new_version}"
+
+[bumpversion:file:taco2/__init__.py]
+search = __version__ = '{current_version}'
+replace = __version__ = '{new_version}'
+
+[bdist_wheel]
+universal = 1
+
+[flake8]
+exclude = docs
+
+[aliases]
+# Define setup.py command aliases here
+test = pytest
+
+[tool:pytest]
+collect_ignore = ['setup.py']
+
+[easy_install]
+index-url = https://pypi.org/simple
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..f07a6e6
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""Setup script for the taco2-tts package (installs the ``taco2`` package)."""
+
+from setuptools import setup, find_packages
+
+with open("README.md") as readme_file:
+    readme = readme_file.read()
+
+with open("HISTORY.rst") as history_file:
+    history = history_file.read()
+
+requirements = [
+    "klepto==0.1.6",
+    "numpy==1.16.4",
+    "inflect==0.2.5",
+    "librosa==0.6.0",
+    "scipy==1.3.0",
+    "Unidecode==1.0.22",
+    "torch==1.1.0",
+    "PyAudio==0.2.11"
+]
+
+setup_requirements = ["pytest-runner"]
+
+test_requirements = ["pytest"]
+
+packages = find_packages()
+
+setup(
+    author="Malar Kannan",
+    author_email="malar@agaralabs.com",
+    classifiers=[
+        "Development Status :: 2 - Pre-Alpha",
+        "Intended Audience :: Developers",
+        "Natural Language :: English",
+        "Programming Language :: Python :: 2",
+        "Programming Language :: Python :: 2.7",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.4",
+        "Programming Language :: Python :: 3.5",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+    ],
+    description="Taco2 TTS package.",
+    install_requires=requirements,
+    long_description=readme + "\n\n" + history,
+    include_package_data=True,
+    keywords="tacotron2",
+    name="taco2-tts",
+    packages=packages,
+    setup_requires=setup_requirements,
+    test_suite="tests",
+    tests_require=test_requirements,
+    url="https://github.com/malarinv/tacotron2",
+    version="0.1.0",
+    zip_safe=False,
+    entry_points={"console_scripts": ("tts_debug = taco2.tts:main",)},
+)
diff --git a/__init__.py b/taco2/__init__.py
similarity index 100%
rename from __init__.py
rename to taco2/__init__.py
diff --git a/audio_processing.py b/taco2/audio_processing.py
similarity index 100%
rename from audio_processing.py
rename to taco2/audio_processing.py
diff --git a/data_utils.py b/taco2/data_utils.py
similarity index 100%
rename from data_utils.py
rename to taco2/data_utils.py
diff --git a/taco2/hparams.py b/taco2/hparams.py
new file mode 100644
index 0000000..1a126a7
--- /dev/null
+++ b/taco2/hparams.py
@@ -0,0 +1,76 @@
+# -*- coding: utf-8 -*-
+# import tensorflow as tf
+from dataclasses import dataclass, field
+from .text import symbols
+
+@dataclass
+class HParams(object):
+    """Tacotron2 hyperparameters with defaults; override via keyword arguments."""
+    ################################
+    # Experiment Parameters        #
+    ################################
+    epochs: int = 500
+    iters_per_checkpoint: int = 1000
+    seed: int = 1234
+    dynamic_loss_scaling: bool = True
+    fp16_run: bool = False
+    distributed_run: bool = False
+    dist_backend: str = "nccl"
+    dist_url: str = "tcp://localhost:54321"
+    cudnn_enabled: bool = True
+    cudnn_benchmark: bool = False
+    ignore_layers: list = field(default_factory=lambda: ["embedding.weight"])  # fresh list per instance
+    ################################
+    # Data Parameters              #
+    ################################
+    load_mel_from_disk: bool = False
+    training_files: str = "lists/tts_data_train_processed.txt"
+    validation_files: str = "filelists/tts_data_val_processed.txt"
+    text_cleaners: list = field(default_factory=lambda: ["english_cleaners"])  # fresh list per instance
+    ################################
+    # Audio Parameters             #
+    ################################
+    max_wav_value: float = 32768.0
+    sampling_rate: int = 16000
+    filter_length: int = 1024
+    hop_length: int = 256
+    win_length: int = 1024
+    n_mel_channels: int = 80
+    mel_fmin: float = 0.0
+    mel_fmax: float = 8000.0
+    ################################
+    # Model Parameters             #
+    ################################
+    n_symbols: int = len(symbols)
+    symbols_embedding_dim: int = 512
+    # Encoder parameters
+    encoder_kernel_size: int = 5
+    encoder_n_convolutions: int = 3
+    encoder_embedding_dim: int = 512
+    # Decoder parameters
+    n_frames_per_step: int = 1  # currently only 1 is supported
+    decoder_rnn_dim: int = 1024
+    prenet_dim: int = 256
+    max_decoder_steps: int = 1000
+    gate_threshold: float = 0.5
+    p_attention_dropout: float = 0.1
+    p_decoder_dropout: float = 0.1
+    # Attention parameters
+    attention_rnn_dim: int = 1024
+    attention_dim: int = 128
+    # Location Layer parameters
+    attention_location_n_filters: int = 32
+    attention_location_kernel_size: int = 31
+    # Mel-post processing network parameters
+    postnet_embedding_dim: int = 512
+    postnet_kernel_size: int = 5
+    postnet_n_convolutions: int = 5
+    ################################
+    # Optimization Hyperparameters #
+    ################################
+    use_saved_learning_rate: bool = False
+    learning_rate: float = 1e-3
+    weight_decay: float = 1e-6
+    grad_clip_thresh: float = 1.0
+    batch_size: int = 4
+    mask_padding: bool = True  # set model's padded outputs to padded values
diff --git a/layers.py b/taco2/layers.py
similarity index 100%
rename from layers.py
rename to taco2/layers.py
diff --git a/loss_function.py b/taco2/loss_function.py
similarity index 100%
rename from loss_function.py
rename to taco2/loss_function.py
diff --git a/model.py b/taco2/model.py
similarity index 100%
rename from model.py
rename to taco2/model.py
diff --git a/stft.py b/taco2/stft.py
similarity index 100%
rename from stft.py
rename to taco2/stft.py
diff --git a/text/LICENSE b/taco2/text/LICENSE
similarity index 100%
rename from text/LICENSE
rename to taco2/text/LICENSE
diff --git a/text/__init__.py b/taco2/text/__init__.py
similarity index 100%
rename from text/__init__.py
rename to taco2/text/__init__.py
diff --git a/text/cleaners.py b/taco2/text/cleaners.py
similarity index 100%
rename from text/cleaners.py
rename to taco2/text/cleaners.py
diff --git a/text/cmudict.py b/taco2/text/cmudict.py
similarity index 100%
rename from text/cmudict.py
rename to taco2/text/cmudict.py
diff --git a/text/numbers.py b/taco2/text/numbers.py
similarity index 100%
rename from text/numbers.py
rename to taco2/text/numbers.py
diff --git a/text/symbols.py b/taco2/text/symbols.py
similarity index 100%
rename
from text/symbols.py rename to taco2/text/symbols.py diff --git a/tts.py b/taco2/tts.py similarity index 80% rename from tts.py rename to taco2/tts.py index f9a97a8..b1e41a9 100644 --- a/tts.py +++ b/taco2/tts.py @@ -3,19 +3,14 @@ import numpy as np import torch -from .hparams import create_hparams -from .text import text_to_sequence -from .glow import WaveGlow - -# import os -# import soundfile as sf import pyaudio -import klepto from librosa import resample from librosa.effects import time_stretch -from sia.file_utils import cached_model_path -from sia.instruments import do_time +import klepto from .model import Tacotron2 +from glow import WaveGlow +from .hparams import HParams +from .text import text_to_sequence TTS_SAMPLE_RATE = 22050 OUTPUT_SAMPLE_RATE = 16000 @@ -35,43 +30,34 @@ WAVEGLOW_CONFIG = { class TTSModel(object): """docstring for TTSModel.""" - def __init__(self): + def __init__(self, tacotron2_path, waveglow_path): super(TTSModel, self).__init__() - hparams = create_hparams() + hparams = HParams() hparams.sampling_rate = TTS_SAMPLE_RATE self.model = Tacotron2(hparams) - tacotron2_path = cached_model_path("tacotron2_model") self.model.load_state_dict( torch.load(tacotron2_path, map_location="cpu")["state_dict"] ) self.model.eval() - waveglow_path = cached_model_path("waveglow_model") - self.waveglow = WaveGlow(**WAVEGLOW_CONFIG) wave_params = torch.load(waveglow_path, map_location="cpu") + self.waveglow = WaveGlow(**WAVEGLOW_CONFIG) self.waveglow.load_state_dict(wave_params) self.waveglow.eval() for k in self.waveglow.convinv: k.float() self.k_cache = klepto.archives.file_archive(cached=False) - self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)( - self.synth_speech - ) + self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(self.synth_speech) # workaround from # https://github.com/NVIDIA/waveglow/issues/127 for m in self.waveglow.modules(): if "Conv" in str(type(m)): setattr(m, "padding_mode", "zeros") - @do_time def 
synth_speech(self, t): text = t - sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[ - None, : - ] + sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[None, :] sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long() - mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference( - sequence - ) + mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(sequence) with torch.no_grad(): audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666) audio = audio_t[0].data.cpu().numpy() @@ -130,10 +116,7 @@ def display(data): def player_gen(): audio_interface = pyaudio.PyAudio() _audio_stream = audio_interface.open( - format=pyaudio.paInt16, - channels=1, - rate=OUTPUT_SAMPLE_RATE, - output=True, + format=pyaudio.paInt16, channels=1, rate=OUTPUT_SAMPLE_RATE, output=True ) def play_device(data): @@ -144,14 +127,30 @@ def player_gen(): def synthesize_corpus(): - tts_model = TTSModel() + tts_model = TTSModel( + "/Users/malar/Work/tacotron2_statedict.pt", + "/Users/malar/Work/waveglow.pt", + ) all_data = [] for (i, line) in enumerate(open("corpus.txt").readlines()): - print('synthesizing... "{}"'.format(line.strip())) + print(f'synthesizing... 
"{line.strip()}"') data = tts_model.synth_speech(line.strip()) all_data.append(data) return all_data +def repl(): + tts_model = TTSModel( + "/Users/malar/Work/tacotron2_statedict.pt", + # "/Users/malar/Work/waveglow_256channels.pt", + "/Users/malar/Work/waveglow.pt", + ) + player = player_gen() + def loop(): + text = input('tts >') + data = tts_model.synth_speech(text.strip()) + player(data) + return loop + def play_corpus(corpus_synths): player = player_gen() @@ -160,11 +159,13 @@ def play_corpus(corpus_synths): def main(): - corpus_synth_data = synthesize_corpus() - play_corpus(corpus_synth_data) - import pdb - - pdb.set_trace() + # corpus_synth_data = synthesize_corpus() + # play_corpus(corpus_synth_data) + interactive_loop = repl() + while True: + interactive_loop() + # import pdb + # pdb.set_trace() if __name__ == "__main__": diff --git a/utils.py b/taco2/utils.py similarity index 100% rename from utils.py rename to taco2/utils.py