packaged taco2

2026-03-07 17:32:33 +00:00 · 2019-09-21 01:19:30 +05:30
parent 1f83e8687c
commit a10a6d517e
26 changed files with 265 additions and 161 deletions
--- a/AUTHORS.rst
+++ b/AUTHORS.rst
@@ -0,0 +1,8 @@
+=======
+Credits
+=======
+
+Contributors
+------------
+
+* Malar Kannan <malarkannan.invention@gmail.com>
--- a/HISTORY.rst
+++ b/HISTORY.rst
@@ -0,0 +1,8 @@
+=======
+History
+=======
+
+0.1.0 (2019-09-20)
+------------------
+
+* First release on PyPI.
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -0,0 +1,10 @@
+include AUTHORS.rst
+include HISTORY.rst
+include LICENSE
+include README.md
+
+recursive-include tests *
+recursive-exclude * __pycache__
+recursive-exclude * *.py[co]
+
+recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif
--- a/README.md
+++ b/README.md
@@ -0,0 +1,34 @@
+# Taco2 TTS
+
+[![image](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black)
+
+> Generate speech audio from text
+---
+
+# Table of Contents
+
+* [Features](#features)
+* [Installation](#installation)
+* [Usage](#usage)
+
+# Features
+
+* Tacotron2 Synthesized Speech
+
+
+# Installation
+Install the package for production use. This also downloads its dependencies:
+```bash
+python setup.py install
+```
+
+> Still facing an issue? Check the [Issues](#issues) section or open a new issue.
+
+The installation should be smooth with Python 3.7 or newer (the package uses the standard-library `dataclasses` module).
+
+# Usage
+> API
+```python
+tts_model = TTSModel("/path/to/tacotron2_model","/path/to/waveglow_model")
+SPEECH_AUDIO = tts_model.synth_speech(TEXT)
+```
--- a/corpus.txt
+++ b/corpus.txt
@@ -0,0 +1,4 @@
+Hello world!
+How have you been?
+Today is a good day.
+This seems to be working good
--- a/hparams.py
+++ b/hparams.py
@@ -1,88 +0,0 @@
-# -*- coding: utf-8 -*-
-import tensorflow as tf
-from .text import symbols
-
-
-# changed path, sampling rate and batch size
-def create_hparams(hparams_string=None, verbose=False):
-    """Create model hyperparameters. Parse nondefault from given string."""
-
-    hparams = tf.contrib.training.HParams(
-        ################################
-        # Experiment Parameters        #
-        ################################
-        epochs=500,
-        iters_per_checkpoint=1000,
-        seed=1234,
-        dynamic_loss_scaling=True,
-        fp16_run=False,
-        distributed_run=False,
-        dist_backend="nccl",
-        dist_url="tcp://localhost:54321",
-        cudnn_enabled=True,
-        cudnn_benchmark=False,
-        ignore_layers=["embedding.weight"],
-        ################################
-        # Data Parameters             #
-        ################################
-        load_mel_from_disk=False,
-        training_files="lists/tts_data_train_processed.txt",
-        validation_files="filelists/tts_data_val_processed.txt",
-        text_cleaners=["english_cleaners"],
-        ################################
-        # Audio Parameters             #
-        ################################
-        max_wav_value=32768.0,
-        sampling_rate=16000,
-        filter_length=1024,
-        hop_length=256,
-        win_length=1024,
-        n_mel_channels=80,
-        mel_fmin=0.0,
-        mel_fmax=8000.0,
-        ################################
-        # Model Parameters             #
-        ################################
-        n_symbols=len(symbols),
-        symbols_embedding_dim=512,
-        # Encoder parameters
-        encoder_kernel_size=5,
-        encoder_n_convolutions=3,
-        encoder_embedding_dim=512,
-        # Decoder parameters
-        n_frames_per_step=1,  # currently only 1 is supported
-        decoder_rnn_dim=1024,
-        prenet_dim=256,
-        max_decoder_steps=1000,
-        gate_threshold=0.5,
-        p_attention_dropout=0.1,
-        p_decoder_dropout=0.1,
-        # Attention parameters
-        attention_rnn_dim=1024,
-        attention_dim=128,
-        # Location Layer parameters
-        attention_location_n_filters=32,
-        attention_location_kernel_size=31,
-        # Mel-post processing network parameters
-        postnet_embedding_dim=512,
-        postnet_kernel_size=5,
-        postnet_n_convolutions=5,
-        ################################
-        # Optimization Hyperparameters #
-        ################################
-        use_saved_learning_rate=False,
-        learning_rate=1e-3,
-        weight_decay=1e-6,
-        grad_clip_thresh=1.0,
-        batch_size=4,
-        mask_padding=True,  # set model's padded outputs to padded values
-    )
-
-    if hparams_string:
-        tf.logging.info("Parsing command line hparams: %s", hparams_string)
-        hparams.parse(hparams_string)
-
-    if verbose:
-        tf.logging.info("Final parsed hparams: %s", hparams.values())
-
-    return hparams
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+numpy==1.17.2
+torch==1.2.0
--- a/server.py
+++ b/server.py
@@ -1,38 +0,0 @@
-# -*- coding: utf-8 -*-
-import grpc
-import time
-from sia.proto import tts_pb2
-from sia.proto import tts_pb2_grpc
-from concurrent import futures
-from .tts import TTSModel
-
-
-class TTSServer:
-    def __init__(self):
-        self.tts_model = TTSModel()
-
-    def TextToSpeechAPI(self, request, context):
-        while True:
-            input_text = request.text
-            speech_response = self.tts_model.synth_speech(input_text)
-            return tts_pb2.SpeechResponse(response=speech_response)
-
-
-def main():
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
-    tts_server = TTSServer()
-    tts_pb2_grpc.add_ServerServicer_to_server(tts_server, server)
-    server.add_insecure_port("localhost:50060")
-    server.start()
-    print("TTSServer started!")
-
-    try:
-        while True:
-            time.sleep(10000)
-    except KeyboardInterrupt:
-        server.start()
-        # server.stop(0)
-
-
-if __name__ == "__main__":
-    main()
--- a/setup.cfg
+++ b/setup.cfg
@@ -0,0 +1,28 @@
+[bumpversion]
+current_version = 0.1.0
+commit = True
+tag = True
+
+[bumpversion:file:setup.py]
+# setup.py spells the version with double quotes, so the pattern must too.
+search = version="{current_version}"
+replace = version="{new_version}"
+
+[bumpversion:file:taco2/__init__.py]
+search = __version__ = '{current_version}'
+replace = __version__ = '{new_version}'
+
+[bdist_wheel]
+universal = 1
+
+[flake8]
+exclude = docs
+
+[aliases]
+# Define setup.py command aliases here
+test = pytest
+
+[tool:pytest]
+collect_ignore = ['setup.py']
+
+[easy_install]
+index-url = http://localhost:8080/simple
--- a/setup.py
+++ b/setup.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""The setup script."""
+
+from setuptools import setup, find_packages
+
+with open("README.md") as readme_file:
+    readme = readme_file.read()
+
+with open("HISTORY.rst") as history_file:
+    history = history_file.read()
+
+requirements = [
+    "klepto==0.1.6",
+    "numpy==1.16.4",
+    "inflect==0.2.5",
+    "librosa==0.6.0",
+    "scipy==1.3.0",
+    "Unidecode==1.0.22",
+    "torch==1.1.0",
+    "PyAudio==0.2.11"
+]
+
+setup_requirements = ["pytest-runner"]
+
+test_requirements = ["pytest"]
+
+packages = find_packages()
+
+setup(
+    author="Malar Kannan",
+    author_email="malar@agaralabs.com",
+    classifiers=[
+        "Development Status :: 2 - Pre-Alpha",
+        "Intended Audience :: Developers",
+        "Natural Language :: English",
+        "Programming Language :: Python :: 2",
+        "Programming Language :: Python :: 2.7",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.4",
+        "Programming Language :: Python :: 3.5",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+    ],
+    description="Taco2 TTS package.",
+    install_requires=requirements,
+    long_description=readme + "\n\n" + history,
+    include_package_data=True,
+    keywords="tacotron2",
+    name="taco2-tts",
+    packages=packages,
+    setup_requires=setup_requirements,
+    test_suite="tests",
+    tests_require=test_requirements,
+    url="https://github.com/malarinv/tacotron2",
+    version="0.1.0",
+    zip_safe=False,
+    entry_points={"console_scripts": ("tts_debug = tts:main",)},
+)
--- a/taco2/init.py
+++ b/taco2/init.py
--- a/taco2/audio_processing.py
+++ b/taco2/audio_processing.py
--- a/taco2/data_utils.py
+++ b/taco2/data_utils.py
--- a/taco2/hparams.py
+++ b/taco2/hparams.py
@@ -0,0 +1,76 @@
+# -*- coding: utf-8 -*-
+# import tensorflow as tf
+from dataclasses import dataclass, field
+from .text import symbols
+
+@dataclass
+class HParams(object):
+    """docstring for HParams."""
+    ################################
+    # Experiment Parameters        #
+    ################################
+    epochs=500
+    iters_per_checkpoint=1000
+    seed=1234
+    dynamic_loss_scaling=True
+    fp16_run=False
+    distributed_run=False
+    dist_backend="nccl"
+    dist_url="tcp://localhost:54321"
+    cudnn_enabled=True
+    cudnn_benchmark=False
+    ignore_layers=["embedding.weight"]
+    ################################
+    # Data Parameters             #
+    ################################
+    load_mel_from_disk=False
+    training_files="lists/tts_data_train_processed.txt"
+    validation_files="filelists/tts_data_val_processed.txt"
+    text_cleaners=["english_cleaners"]
+    ################################
+    # Audio Parameters             #
+    ################################
+    max_wav_value=32768.0
+    sampling_rate=16000
+    filter_length=1024
+    hop_length=256
+    win_length=1024
+    n_mel_channels=80
+    mel_fmin=0.0
+    mel_fmax=8000.0
+    ################################
+    # Model Parameters             #
+    ################################
+    n_symbols=len(symbols)
+    symbols_embedding_dim=512
+    # Encoder parameters
+    encoder_kernel_size=5
+    encoder_n_convolutions=3
+    encoder_embedding_dim=512
+    # Decoder parameters
+    n_frames_per_step=1  # currently only 1 is supported
+    decoder_rnn_dim=1024
+    prenet_dim=256
+    max_decoder_steps=1000
+    gate_threshold=0.5
+    p_attention_dropout=0.1
+    p_decoder_dropout=0.1
+    # Attention parameters
+    attention_rnn_dim=1024
+    attention_dim=128
+    # Location Layer parameters
+    attention_location_n_filters=32
+    attention_location_kernel_size=31
+    # Mel-post processing network parameters
+    postnet_embedding_dim=512
+    postnet_kernel_size=5
+    postnet_n_convolutions=5
+    ################################
+    # Optimization Hyperparameters #
+    ################################
+    use_saved_learning_rate=False
+    learning_rate=1e-3
+    weight_decay=1e-6
+    grad_clip_thresh=1.0
+    batch_size=4
+    mask_padding=True  # set model's padded outputs to padded values
--- a/taco2/layers.py
+++ b/taco2/layers.py
--- a/taco2/loss_function.py
+++ b/taco2/loss_function.py
--- a/taco2/model.py
+++ b/taco2/model.py
--- a/taco2/stft.py
+++ b/taco2/stft.py
--- a/taco2/text/LICENSE
+++ b/taco2/text/LICENSE
--- a/taco2/text/init.py
+++ b/taco2/text/init.py
--- a/taco2/text/cleaners.py
+++ b/taco2/text/cleaners.py
--- a/taco2/text/cmudict.py
+++ b/taco2/text/cmudict.py
--- a/taco2/text/numbers.py
+++ b/taco2/text/numbers.py
--- a/taco2/text/symbols.py
+++ b/taco2/text/symbols.py
--- a/taco2/tts.py
+++ b/taco2/tts.py
@@ -3,19 +3,14 @@

 import numpy as np
 import torch
-from .hparams import create_hparams
-from .text import text_to_sequence
-from .glow import WaveGlow
-
-# import os
-# import soundfile as sf
 import pyaudio
-import klepto
 from librosa import resample
 from librosa.effects import time_stretch
-from sia.file_utils import cached_model_path
-from sia.instruments import do_time
+import klepto
 from .model import Tacotron2
+from glow import WaveGlow
+from .hparams import HParams
+from .text import text_to_sequence

 TTS_SAMPLE_RATE = 22050
 OUTPUT_SAMPLE_RATE = 16000
@@ -35,43 +30,34 @@ WAVEGLOW_CONFIG = {
 class TTSModel(object):
    """docstring for TTSModel."""

-    def __init__(self):
+    def __init__(self, tacotron2_path, waveglow_path):
        super(TTSModel, self).__init__()
-        hparams = create_hparams()
+        hparams = HParams()
        hparams.sampling_rate = TTS_SAMPLE_RATE
        self.model = Tacotron2(hparams)
-        tacotron2_path = cached_model_path("tacotron2_model")
        self.model.load_state_dict(
            torch.load(tacotron2_path, map_location="cpu")["state_dict"]
        )
        self.model.eval()
-        waveglow_path = cached_model_path("waveglow_model")
-        self.waveglow = WaveGlow(**WAVEGLOW_CONFIG)
        wave_params = torch.load(waveglow_path, map_location="cpu")
+        self.waveglow = WaveGlow(**WAVEGLOW_CONFIG)
        self.waveglow.load_state_dict(wave_params)
        self.waveglow.eval()
        for k in self.waveglow.convinv:
            k.float()
        self.k_cache = klepto.archives.file_archive(cached=False)
-        self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
-            self.synth_speech
-        )
+        self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(self.synth_speech)
        # workaround from
        # https://github.com/NVIDIA/waveglow/issues/127
        for m in self.waveglow.modules():
            if "Conv" in str(type(m)):
                setattr(m, "padding_mode", "zeros")

-    @do_time
    def synth_speech(self, t):
        text = t
-        sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[
-            None, :
-        ]
+        sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[None, :]
        sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long()
-        mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(
-            sequence
-        )
+        mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(sequence)
        with torch.no_grad():
            audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)
        audio = audio_t[0].data.cpu().numpy()
@@ -130,10 +116,7 @@ def display(data):
 def player_gen():
    audio_interface = pyaudio.PyAudio()
    _audio_stream = audio_interface.open(
-        format=pyaudio.paInt16,
-        channels=1,
-        rate=OUTPUT_SAMPLE_RATE,
-        output=True,
+        format=pyaudio.paInt16, channels=1, rate=OUTPUT_SAMPLE_RATE, output=True
    )

    def play_device(data):
@@ -144,14 +127,30 @@ def player_gen():


 def synthesize_corpus():
-    tts_model = TTSModel()
+    tts_model = TTSModel(
+        "/Users/malar/Work/tacotron2_statedict.pt",
+        "/Users/malar/Work/waveglow.pt",
+    )
    all_data = []
    for (i, line) in enumerate(open("corpus.txt").readlines()):
-        print('synthesizing... "{}"'.format(line.strip()))
+        print(f'synthesizing... "{line.strip()}"')
        data = tts_model.synth_speech(line.strip())
        all_data.append(data)
    return all_data

+def repl():
+    tts_model = TTSModel(
+        "/Users/malar/Work/tacotron2_statedict.pt",
+        # "/Users/malar/Work/waveglow_256channels.pt",
+        "/Users/malar/Work/waveglow.pt",
+    )
+    player = player_gen()
+    def loop():
+        text = input('tts >')
+        data = tts_model.synth_speech(text.strip())
+        player(data)
+    return loop
+

 def play_corpus(corpus_synths):
    player = player_gen()
@@ -160,11 +159,13 @@ def play_corpus(corpus_synths):


 def main():
-    corpus_synth_data = synthesize_corpus()
-    play_corpus(corpus_synth_data)
-    import pdb
-
-    pdb.set_trace()
+    # corpus_synth_data = synthesize_corpus()
+    # play_corpus(corpus_synth_data)
+    interactive_loop = repl()
+    while True:
+        interactive_loop()
+    # import pdb
+    # pdb.set_trace()


 if __name__ == "__main__":
--- a/taco2/utils.py
+++ b/taco2/utils.py