diff --git a/AUTHORS.rst b/AUTHORS.rst
new file mode 100644
index 0000000..f7cb4d4
--- /dev/null
+++ b/AUTHORS.rst
@@ -0,0 +1,8 @@
+=======
+Credits
+=======
+
+Contributors
+------------
+
+* Malar Kannan
diff --git a/HISTORY.rst b/HISTORY.rst
new file mode 100644
index 0000000..e06aacb
--- /dev/null
+++ b/HISTORY.rst
@@ -0,0 +1,8 @@
+=======
+History
+=======
+
+0.1.0 (2019-09-20)
+------------------
+
+* First release on PyPI.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..36ea105
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,10 @@
+include AUTHORS.rst
+include HISTORY.rst
+include LICENSE
+include README.md
+
+recursive-include tests *
+recursive-exclude * __pycache__
+recursive-exclude * *.py[co]
+
+recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f64a96b
--- /dev/null
+++ b/README.md
@@ -0,0 +1,34 @@
+# Taco2 TTS
+
+[![image](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black)
+
+> Generate speech audio from text
+---
+
+# Table of Contents
+
+* [Features](#features)
+* [Installation](#installation)
+* [Usage](#usage)
+
+# Features
+
+* Tacotron2 Synthesized Speech
+
+
+# Installation
+Install the package for production use. This downloads the dependencies:
+```bash
+python setup.py install
+```
+
+> Still facing an issue? Open a new issue on the project repository.
+
+The installation should be smooth with Python 3.6 or newer.
+
+# Usage
+> API
+```python
+tts_model = TTSModel("/path/to/tacotron2_model", "/path/to/waveglow_model")
+SPEECH_AUDIO = tts_model.synth_speech(TEXT)
+```
diff --git a/corpus.txt b/corpus.txt
new file mode 100644
index 0000000..64314d7
--- /dev/null
+++ b/corpus.txt
@@ -0,0 +1,4 @@
+Hello world!
+How have you been?
+Today is a good day.
+This seems to be working good
diff --git a/hparams.py b/hparams.py
deleted file mode 100644
index 58cf525..0000000
--- a/hparams.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# -*- coding: utf-8 -*-
-import tensorflow as tf
-from .text import symbols
-
-
-# changed path, sampling rate and batch size
-def create_hparams(hparams_string=None, verbose=False):
-    """Create model hyperparameters. Parse nondefault from given string."""
-
-    hparams = tf.contrib.training.HParams(
-        ################################
-        # Experiment Parameters        #
-        ################################
-        epochs=500,
-        iters_per_checkpoint=1000,
-        seed=1234,
-        dynamic_loss_scaling=True,
-        fp16_run=False,
-        distributed_run=False,
-        dist_backend="nccl",
-        dist_url="tcp://localhost:54321",
-        cudnn_enabled=True,
-        cudnn_benchmark=False,
-        ignore_layers=["embedding.weight"],
-        ################################
-        # Data Parameters              #
-        ################################
-        load_mel_from_disk=False,
-        training_files="lists/tts_data_train_processed.txt",
-        validation_files="filelists/tts_data_val_processed.txt",
-        text_cleaners=["english_cleaners"],
-        ################################
-        # Audio Parameters             #
-        ################################
-        max_wav_value=32768.0,
-        sampling_rate=16000,
-        filter_length=1024,
-        hop_length=256,
-        win_length=1024,
-        n_mel_channels=80,
-        mel_fmin=0.0,
-        mel_fmax=8000.0,
-        ################################
-        # Model Parameters             #
-        ################################
-        n_symbols=len(symbols),
-        symbols_embedding_dim=512,
-        # Encoder parameters
-        encoder_kernel_size=5,
-        encoder_n_convolutions=3,
-        encoder_embedding_dim=512,
-        # Decoder parameters
-        n_frames_per_step=1,  # currently only 1 is supported
-        decoder_rnn_dim=1024,
-        prenet_dim=256,
-        max_decoder_steps=1000,
-        gate_threshold=0.5,
-        p_attention_dropout=0.1,
-        p_decoder_dropout=0.1,
-        # Attention parameters
-        attention_rnn_dim=1024,
-        attention_dim=128,
-        # Location Layer parameters
-        attention_location_n_filters=32,
-        attention_location_kernel_size=31,
-        # Mel-post processing network parameters
-        postnet_embedding_dim=512,
-        postnet_kernel_size=5,
-        postnet_n_convolutions=5,
-        ################################
-        # Optimization Hyperparameters #
-        ################################
-        use_saved_learning_rate=False,
-        learning_rate=1e-3,
-        weight_decay=1e-6,
-        grad_clip_thresh=1.0,
-        batch_size=4,
-        mask_padding=True,  # set model's padded outputs to padded values
-    )
-
-    if hparams_string:
-        tf.logging.info("Parsing command line hparams: %s", hparams_string)
-        hparams.parse(hparams_string)
-
-    if verbose:
-        tf.logging.info("Final parsed hparams: %s", hparams.values())
-
-    return hparams
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..ba4d322
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+numpy==1.17.2
+torch==1.2.0
diff --git a/server.py b/server.py
deleted file mode 100644
index d41119a..0000000
--- a/server.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# -*- coding: utf-8 -*-
-import grpc
-import time
-from sia.proto import tts_pb2
-from sia.proto import tts_pb2_grpc
-from concurrent import futures
-from .tts import TTSModel
-
-
-class TTSServer:
-    def __init__(self):
-        self.tts_model = TTSModel()
-
-    def TextToSpeechAPI(self, request, context):
-        while True:
-            input_text = request.text
-            speech_response = self.tts_model.synth_speech(input_text)
-            return tts_pb2.SpeechResponse(response=speech_response)
-
-
-def main():
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
-    tts_server = TTSServer()
-    tts_pb2_grpc.add_ServerServicer_to_server(tts_server, server)
-    server.add_insecure_port("localhost:50060")
-    server.start()
-    print("TTSServer started!")
-
-    try:
-        while True:
-            time.sleep(10000)
-    except KeyboardInterrupt:
-        server.start()
-        # server.stop(0)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..82236da
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,28 @@
+[bumpversion]
+current_version = 0.1.0
+commit = True
+tag = True
+
+[bumpversion:file:setup.py]
+search = version='{current_version}'
+replace = version='{new_version}'
+
+[bumpversion:file:taco2/__init__.py]
+search = __version__ = '{current_version}'
+replace = __version__ = '{new_version}'
+
+[bdist_wheel]
+universal = 1
+
+[flake8]
+exclude = docs
+
+[aliases]
+# Define setup.py command aliases here
+test = pytest
+
+[tool:pytest]
+collect_ignore = ['setup.py']
+
+[easy_install]
+index-url = http://localhost:8080/simple
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..f07a6e6
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""The setup script."""
+
+from setuptools import setup, find_packages
+
+with open("README.md") as readme_file:
+    readme = readme_file.read()
+
+with open("HISTORY.rst") as history_file:
+    history = history_file.read()
+
+requirements = [
+    "klepto==0.1.6",
+    "numpy==1.16.4",
+    "inflect==0.2.5",
+    "librosa==0.6.0",
+    "scipy==1.3.0",
+    "Unidecode==1.0.22",
+    "torch==1.1.0",
+    "PyAudio==0.2.11"
+]
+
+setup_requirements = ["pytest-runner"]
+
+test_requirements = ["pytest"]
+
+packages = find_packages()
+
+setup(
+    author="Malar Kannan",
+    author_email="malar@agaralabs.com",
+    classifiers=[
+        "Development Status :: 2 - Pre-Alpha",
+        "Intended Audience :: Developers",
+        "Natural Language :: English",
+        "Programming Language :: Python :: 2",
+        "Programming Language :: Python :: 2.7",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.4",
+        "Programming Language :: Python :: 3.5",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+    ],
+    description="Taco2 TTS package.",
+    install_requires=requirements,
+    long_description=readme + "\n\n" + history,
+    include_package_data=True,
+    keywords="tacotron2",
+    name="taco2-tts",
+    packages=packages,
+    setup_requires=setup_requirements,
+    test_suite="tests",
+    tests_require=test_requirements,
+    url="https://github.com/malarinv/tacotron2",
+    version="0.1.0",
+    zip_safe=False,
+    entry_points={"console_scripts": ("tts_debug = taco2.tts:main",)},
+)
diff --git a/__init__.py b/taco2/__init__.py
similarity index 100%
rename from __init__.py
rename to taco2/__init__.py
diff --git a/audio_processing.py b/taco2/audio_processing.py
similarity index 100%
rename from audio_processing.py
rename to taco2/audio_processing.py
diff --git a/data_utils.py b/taco2/data_utils.py
similarity index 100%
rename from data_utils.py
rename to taco2/data_utils.py
diff --git a/taco2/hparams.py b/taco2/hparams.py
new file mode 100644
index 0000000..1a126a7
--- /dev/null
+++ b/taco2/hparams.py
@@ -0,0 +1,76 @@
+# -*- coding: utf-8 -*-
+# import tensorflow as tf
+from dataclasses import dataclass
+from .text import symbols
+
+@dataclass
+class HParams(object):
+    """docstring for HParams."""
+    ################################
+    # Experiment Parameters        #
+    ################################
+    epochs=500
+    iters_per_checkpoint=1000
+    seed=1234
+    dynamic_loss_scaling=True
+    fp16_run=False
+    distributed_run=False
+    dist_backend="nccl"
+    dist_url="tcp://localhost:54321"
+    cudnn_enabled=True
+    cudnn_benchmark=False
+    ignore_layers=["embedding.weight"]
+    ################################
+    # Data Parameters              #
+    ################################
+    load_mel_from_disk=False
+    training_files="lists/tts_data_train_processed.txt"
+    validation_files="filelists/tts_data_val_processed.txt"
+    text_cleaners=["english_cleaners"]
+    ################################
+    # Audio Parameters             #
+    ################################
+    max_wav_value=32768.0
+    sampling_rate=16000
+    filter_length=1024
+    hop_length=256
+    win_length=1024
+    n_mel_channels=80
+    mel_fmin=0.0
+    mel_fmax=8000.0
+    ################################
+    # Model Parameters             #
+    ################################
+    n_symbols=len(symbols)
+    symbols_embedding_dim=512
+    # Encoder parameters
+    encoder_kernel_size=5
+    encoder_n_convolutions=3
+    encoder_embedding_dim=512
+    # Decoder parameters
+    n_frames_per_step=1  # currently only 1 is supported
+    decoder_rnn_dim=1024
+    prenet_dim=256
+    max_decoder_steps=1000
+    gate_threshold=0.5
+    p_attention_dropout=0.1
+    p_decoder_dropout=0.1
+    # Attention parameters
+    attention_rnn_dim=1024
+    attention_dim=128
+    # Location Layer parameters
+    attention_location_n_filters=32
+    attention_location_kernel_size=31
+    # Mel-post processing network parameters
+    postnet_embedding_dim=512
+    postnet_kernel_size=5
+    postnet_n_convolutions=5
+    ################################
+    # Optimization Hyperparameters #
+    ################################
+    use_saved_learning_rate=False
+    learning_rate=1e-3
+    weight_decay=1e-6
+    grad_clip_thresh=1.0
+    batch_size=4
+    mask_padding=True  # set model's padded outputs to padded values
diff --git a/layers.py b/taco2/layers.py
similarity index 100%
rename from layers.py
rename to taco2/layers.py
diff --git a/loss_function.py b/taco2/loss_function.py
similarity index 100%
rename from loss_function.py
rename to taco2/loss_function.py
diff --git a/model.py b/taco2/model.py
similarity index 100%
rename from model.py
rename to taco2/model.py
diff --git a/stft.py b/taco2/stft.py
similarity index 100%
rename from stft.py
rename to taco2/stft.py
diff --git a/text/LICENSE b/taco2/text/LICENSE
similarity index 100%
rename from text/LICENSE
rename to taco2/text/LICENSE
diff --git a/text/__init__.py b/taco2/text/__init__.py
similarity index 100%
rename from text/__init__.py
rename to taco2/text/__init__.py
diff --git a/text/cleaners.py b/taco2/text/cleaners.py
similarity index 100%
rename from text/cleaners.py
rename to taco2/text/cleaners.py
diff --git a/text/cmudict.py b/taco2/text/cmudict.py
similarity index 100%
rename from text/cmudict.py
rename to taco2/text/cmudict.py
diff --git a/text/numbers.py b/taco2/text/numbers.py
similarity index 100%
rename from text/numbers.py
rename to taco2/text/numbers.py
diff --git a/text/symbols.py b/taco2/text/symbols.py
similarity index 100%
rename from text/symbols.py
rename to taco2/text/symbols.py
diff --git a/tts.py b/taco2/tts.py
similarity index 80%
rename from tts.py
rename to taco2/tts.py
index f9a97a8..b1e41a9 100644
--- a/tts.py
+++ b/taco2/tts.py
@@ -3,19 +3,14 @@
 import numpy as np
 import torch
-from .hparams import create_hparams
-from .text import text_to_sequence
-from .glow import WaveGlow
-
-# import os
-# import soundfile as sf
 import pyaudio
-import klepto
 from librosa import resample
 from librosa.effects import time_stretch
-from sia.file_utils import cached_model_path
-from sia.instruments import do_time
+import klepto
 from .model import Tacotron2
+from glow import WaveGlow
+from .hparams import HParams
+from .text import text_to_sequence
 
 TTS_SAMPLE_RATE = 22050
 OUTPUT_SAMPLE_RATE = 16000
 
@@ -35,43 +30,34 @@ WAVEGLOW_CONFIG = {
 class TTSModel(object):
     """docstring for TTSModel."""
 
-    def __init__(self):
+    def __init__(self, tacotron2_path, waveglow_path):
         super(TTSModel, self).__init__()
-        hparams = create_hparams()
+        hparams = HParams()
         hparams.sampling_rate = TTS_SAMPLE_RATE
         self.model = Tacotron2(hparams)
-        tacotron2_path = cached_model_path("tacotron2_model")
         self.model.load_state_dict(
             torch.load(tacotron2_path, map_location="cpu")["state_dict"]
         )
         self.model.eval()
-        waveglow_path = cached_model_path("waveglow_model")
-        self.waveglow = WaveGlow(**WAVEGLOW_CONFIG)
         wave_params = torch.load(waveglow_path, map_location="cpu")
+        self.waveglow = WaveGlow(**WAVEGLOW_CONFIG)
         self.waveglow.load_state_dict(wave_params)
         self.waveglow.eval()
         for k in self.waveglow.convinv:
             k.float()
         self.k_cache = klepto.archives.file_archive(cached=False)
-        self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
-            self.synth_speech
-        )
+        self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(self.synth_speech)
         # workaround from
         # https://github.com/NVIDIA/waveglow/issues/127
         for m in self.waveglow.modules():
             if "Conv" in str(type(m)):
                 setattr(m, "padding_mode", "zeros")
 
-    @do_time
     def synth_speech(self, t):
         text = t
-        sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[
-            None, :
-        ]
+        sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[None, :]
        sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long()
-        mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(
-            sequence
-        )
+        mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(sequence)
         with torch.no_grad():
             audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)
             audio = audio_t[0].data.cpu().numpy()
@@ -130,10 +116,7 @@ def display(data):
 def player_gen():
     audio_interface = pyaudio.PyAudio()
     _audio_stream = audio_interface.open(
-        format=pyaudio.paInt16,
-        channels=1,
-        rate=OUTPUT_SAMPLE_RATE,
-        output=True,
+        format=pyaudio.paInt16, channels=1, rate=OUTPUT_SAMPLE_RATE, output=True
     )
 
     def play_device(data):
@@ -144,14 +127,30 @@ def player_gen():
 
 
 def synthesize_corpus():
-    tts_model = TTSModel()
+    tts_model = TTSModel(
+        "/Users/malar/Work/tacotron2_statedict.pt",
+        "/Users/malar/Work/waveglow.pt",
+    )
     all_data = []
     for (i, line) in enumerate(open("corpus.txt").readlines()):
-        print('synthesizing... "{}"'.format(line.strip()))
+        print(f'synthesizing... "{line.strip()}"')
         data = tts_model.synth_speech(line.strip())
         all_data.append(data)
     return all_data
 
 
+def repl():
+    tts_model = TTSModel(
+        "/Users/malar/Work/tacotron2_statedict.pt",
+        # "/Users/malar/Work/waveglow_256channels.pt",
+        "/Users/malar/Work/waveglow.pt",
+    )
+    player = player_gen()
+    def loop():
+        text = input('tts >')
+        data = tts_model.synth_speech(text.strip())
+        player(data)
+    return loop
+
 def play_corpus(corpus_synths):
     player = player_gen()
@@ -160,11 +159,13 @@ def play_corpus(corpus_synths):
 
 
 def main():
-    corpus_synth_data = synthesize_corpus()
-    play_corpus(corpus_synth_data)
-    import pdb
-
-    pdb.set_trace()
+    # corpus_synth_data = synthesize_corpus()
+    # play_corpus(corpus_synth_data)
+    interactive_loop = repl()
+    while True:
+        interactive_loop()
+    # import pdb
+    # pdb.set_trace()
 
 
 if __name__ == "__main__":
diff --git a/utils.py b/taco2/utils.py
similarity index 100%
rename from utils.py
rename to taco2/utils.py
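
For reference, a minimal end-to-end sketch of the relocated `TTSModel` API from `taco2/tts.py`. The checkpoint paths are placeholders, and it assumes that `synth_speech` returns 16-bit PCM at `OUTPUT_SAMPLE_RATE` (16 kHz), which is what the `pyaudio` player settings (`paInt16`, `rate=OUTPUT_SAMPLE_RATE`) suggest; verify against the actual return type before relying on it.

```python
# Sketch only: paths are placeholders; the output format (16-bit PCM at
# OUTPUT_SAMPLE_RATE) is an assumption inferred from the pyaudio player
# configured in taco2/tts.py.
import numpy as np
from scipy.io import wavfile  # scipy is already in install_requires

from taco2.tts import OUTPUT_SAMPLE_RATE, TTSModel

tts_model = TTSModel(
    "/path/to/tacotron2_statedict.pt",  # Tacotron2 checkpoint
    "/path/to/waveglow.pt",             # WaveGlow checkpoint
)

audio = tts_model.synth_speech("Hello world!")
# Accept either raw bytes or a numpy array of samples.
if isinstance(audio, (bytes, bytearray)):
    samples = np.frombuffer(audio, dtype=np.int16)
else:
    samples = np.asarray(audio)
wavfile.write("hello.wav", OUTPUT_SAMPLE_RATE, samples.astype(np.int16))
```

The same call is what the new `repl()` helper wires into `player_gen()` for interactive listening, and what the `tts_debug` console script exercises via `main()`.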
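
A second small sketch, showing how the new `HParams` container in `taco2/hparams.py` can be used for per-run overrides, mirroring the `sampling_rate` override in `TTSModel.__init__`. One caveat worth knowing: since the attributes carry no type annotations, `@dataclass` registers no fields for them, so they remain ordinary class attributes and instance-level assignment simply shadows the class defaults. The `batch_size` override below is a hypothetical example, not something the patch itself does.

```python
from taco2.hparams import HParams

# HParams stands in for the old tf.contrib.training.HParams container.
# Without type annotations, @dataclass generates an empty __init__, so the
# values defined on the class are plain class attributes; assigning on an
# instance shadows the shared default rather than mutating it.
hparams = HParams()
hparams.sampling_rate = 22050  # TTSModel sets this to TTS_SAMPLE_RATE
hparams.batch_size = 32        # hypothetical training-time override

print(hparams.sampling_rate, hparams.n_mel_channels)  # -> 22050 80
```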