packaged taco2

2026-03-08 01:32:35 +00:00 · 2019-09-21 01:19:30 +05:30
parent 1f83e8687c
commit a10a6d517e
26 changed files with 265 additions and 161 deletions
--- a/AUTHORS.rst
+++ b/AUTHORS.rst
@@ -0,0 +1,8 @@
 =======
 Credits
 =======
 Contributors
 ------------
 * Malar Kannan <malarkannan.invention@gmail.com>
--- a/HISTORY.rst
+++ b/HISTORY.rst
@@ -0,0 +1,8 @@
 =======
 History
 =======
 0.1.0 (2019-09-20)
 ------------------
 * First release on PyPI.
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -0,0 +1,10 @@
 include AUTHORS.rst
 include HISTORY.rst
 include LICENSE
 include README.md
 recursive-include tests *
 recursive-exclude * __pycache__
 recursive-exclude * *.py[co]
 recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif
--- a/README.md
+++ b/README.md
@@ -0,0 +1,34 @@
 # Taco2 TTS
 [![image](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black)
 > Generate speech audio from text
 ---
 # Table of Contents
 * [Features](#features)
 * [Installation](#installation)
 * [Usage](#usage)
 # Features
 * Tacotron2 Synthesized Speech
 # Installation
 Install the package for production use. This also downloads its dependencies.
 ```bash
 python setup.py install
 ```
 > Still facing an issue? Check the [Issues](#issues) section or open a new issue.
 The installation should be smooth with Python 3.6 or newer.
 # Usage
 > API
 ```python
 tts_model = TTSModel("/path/to/tacotron2_model","/path/to/waveglow_model")
 SPEECH_AUDIO = tts_model.synth_speech(TEXT)
 ```
--- a/corpus.txt
+++ b/corpus.txt
@@ -0,0 +1,4 @@
 Hello world!
 How have you been?
 Today is a good day.
 This seems to be working good
--- a/hparams.py
+++ b/hparams.py
@@ -1,88 +0,0 @@
 # -*- coding: utf-8 -*-
 import tensorflow as tf
 from .text import symbols
 # changed path, sampling rate and batch size
 def create_hparams(hparams_string=None, verbose=False):
    """Build the default hyperparameter set, optionally overridden from a string.

    Args:
        hparams_string: optional override string handed to ``HParams.parse``;
            skipped when empty or ``None``.
        verbose: when true, log the final hyperparameter values.

    Returns:
        The populated ``tf.contrib.training.HParams`` object.
    """
    # Experiment parameters
    experiment = dict(
        epochs=500,
        iters_per_checkpoint=1000,
        seed=1234,
        dynamic_loss_scaling=True,
        fp16_run=False,
        distributed_run=False,
        dist_backend="nccl",
        dist_url="tcp://localhost:54321",
        cudnn_enabled=True,
        cudnn_benchmark=False,
        ignore_layers=["embedding.weight"],
    )
    # Data parameters
    data = dict(
        load_mel_from_disk=False,
        training_files="lists/tts_data_train_processed.txt",
        validation_files="filelists/tts_data_val_processed.txt",
        text_cleaners=["english_cleaners"],
    )
    # Audio parameters
    audio = dict(
        max_wav_value=32768.0,
        sampling_rate=16000,
        filter_length=1024,
        hop_length=256,
        win_length=1024,
        n_mel_channels=80,
        mel_fmin=0.0,
        mel_fmax=8000.0,
    )
    # Model parameters: embedding, encoder, decoder, attention,
    # location layer and mel post-processing network, in that order.
    model = dict(
        n_symbols=len(symbols),
        symbols_embedding_dim=512,
        encoder_kernel_size=5,
        encoder_n_convolutions=3,
        encoder_embedding_dim=512,
        n_frames_per_step=1,  # currently only 1 is supported
        decoder_rnn_dim=1024,
        prenet_dim=256,
        max_decoder_steps=1000,
        gate_threshold=0.5,
        p_attention_dropout=0.1,
        p_decoder_dropout=0.1,
        attention_rnn_dim=1024,
        attention_dim=128,
        attention_location_n_filters=32,
        attention_location_kernel_size=31,
        postnet_embedding_dim=512,
        postnet_kernel_size=5,
        postnet_n_convolutions=5,
    )
    # Optimization hyperparameters
    optimization = dict(
        use_saved_learning_rate=False,
        learning_rate=1e-3,
        weight_decay=1e-6,
        grad_clip_thresh=1.0,
        batch_size=4,
        mask_padding=True,  # set model's padded outputs to padded values
    )
    # Merge the sections in their declaration order.
    defaults = {}
    for section in (experiment, data, audio, model, optimization):
        defaults.update(section)
    hparams = tf.contrib.training.HParams(**defaults)
    if hparams_string:
        tf.logging.info("Parsing command line hparams: %s", hparams_string)
        hparams.parse(hparams_string)
    if verbose:
        tf.logging.info("Final parsed hparams: %s", hparams.values())
    return hparams
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,2 @@
 numpy==1.17.2
 torch==1.2.0
--- a/server.py
+++ b/server.py
@@ -1,38 +0,0 @@
 # -*- coding: utf-8 -*-
 import grpc
 import time
 from sia.proto import tts_pb2
 from sia.proto import tts_pb2_grpc
 from concurrent import futures
 from .tts import TTSModel
 class TTSServer:
    """Servicer object that answers text-to-speech requests using a TTSModel."""
    def __init__(self):
        # The model is built once here and reused for every request.
        self.tts_model = TTSModel()
    def TextToSpeechAPI(self, request, context):
        """Synthesize ``request.text`` and return it wrapped in a SpeechResponse."""
        synthesized = self.tts_model.synth_speech(request.text)
        return tts_pb2.SpeechResponse(response=synthesized)
 def main():
    """Start the TTS gRPC server on localhost:50060 and block until interrupted.

    The server is backed by a single-worker thread pool. On Ctrl-C
    (``KeyboardInterrupt``) the server is stopped with no grace period.
    """
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
    tts_server = TTSServer()
    tts_pb2_grpc.add_ServerServicer_to_server(tts_server, server)
    server.add_insecure_port("localhost:50060")
    server.start()
    print("TTSServer started!")
    try:
        # Keep the main thread alive while the server's worker handles requests.
        while True:
            time.sleep(10000)
    except KeyboardInterrupt:
        # Shut the running server down instead of trying to start it again.
        server.stop(0)
 if __name__ == "__main__":
    main()
--- a/setup.cfg
+++ b/setup.cfg
@@ -0,0 +1,28 @@
 [bumpversion]
 current_version = 0.1.0
 commit = True
 tag = True
 [bumpversion:file:setup.py]
 search = version='{current_version}'
 replace = version='{new_version}'
 [bumpversion:file:taco2/__init__.py]
 search = __version__ = '{current_version}'
 replace = __version__ = '{new_version}'
 [bdist_wheel]
 universal = 1
 [flake8]
 exclude = docs
 [aliases]
 # Define setup.py command aliases here
 test = pytest
 [tool:pytest]
 collect_ignore = ['setup.py']
 [easy_install]
 index-url = http://localhost:8080/simple
--- a/setup.py
+++ b/setup.py
@@ -0,0 +1,59 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 """The setup script."""
 from setuptools import setup, find_packages
 # The long description is the README followed by the changelog.
 with open("README.md", encoding="utf-8") as readme_file:
    readme = readme_file.read()
 with open("HISTORY.rst", encoding="utf-8") as history_file:
    history = history_file.read()
 # Runtime dependencies, pinned to exact versions.
 # NOTE(review): requirements.txt pins numpy==1.17.2 and torch==1.2.0 --
 # confirm which set of pins is the intended one.
 requirements = [
    "klepto==0.1.6",
    "numpy==1.16.4",
    "inflect==0.2.5",
    "librosa==0.6.0",
    "scipy==1.3.0",
    "Unidecode==1.0.22",
    "torch==1.1.0",
    "PyAudio==0.2.11",
 ]
 setup_requirements = ["pytest-runner"]
 test_requirements = ["pytest"]
 packages = find_packages()
 setup(
    author="Malar Kannan",
    author_email="malar@agaralabs.com",
    # Only Python 3.6+ is listed: the package sources use f-strings.
    classifiers=[
        "Development Status :: 2 - Pre-Alpha",
        "Intended Audience :: Developers",
        "Natural Language :: English",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
    ],
    description="Taco2 TTS package.",
    install_requires=requirements,
    long_description=readme + "\n\n" + history,
    include_package_data=True,
    keywords="tacotron2",
    name="taco2-tts",
    packages=packages,
    setup_requires=setup_requirements,
    test_suite="tests",
    tests_require=test_requirements,
    url="https://github.com/malarinv/tacotron2",
    version="0.1.0",
    zip_safe=False,
    # main() lives in the installed taco2 package (taco2/tts.py); there is no
    # top-level "tts" module for the console script to import.
    entry_points={"console_scripts": ("tts_debug = taco2.tts:main",)},
 )
--- a/taco2/init.py
+++ b/taco2/init.py
--- a/taco2/audio_processing.py
+++ b/taco2/audio_processing.py
--- a/taco2/data_utils.py
+++ b/taco2/data_utils.py
--- a/taco2/hparams.py
+++ b/taco2/hparams.py
@@ -0,0 +1,76 @@
 # -*- coding: utf-8 -*-
 # import tensorflow as tf
 from dataclasses import dataclass, field
 from typing import List
 from .text import symbols
@dataclass
class HParams(object):
    """Default hyperparameters for the Tacotron2 model, training and audio.

    Every attribute is an annotated dataclass field, so ``HParams()`` yields
    the defaults below and any of them can be overridden by keyword, e.g.
    ``HParams(batch_size=8)``. List-valued defaults are created per instance
    so that mutating one instance does not affect another.
    """
    ################################
    # Experiment Parameters        #
    ################################
    epochs: int = 500
    iters_per_checkpoint: int = 1000
    seed: int = 1234
    dynamic_loss_scaling: bool = True
    fp16_run: bool = False
    distributed_run: bool = False
    dist_backend: str = "nccl"
    dist_url: str = "tcp://localhost:54321"
    cudnn_enabled: bool = True
    cudnn_benchmark: bool = False
    ignore_layers: List[str] = field(default_factory=lambda: ["embedding.weight"])
    ################################
    # Data Parameters             #
    ################################
    load_mel_from_disk: bool = False
    training_files: str = "lists/tts_data_train_processed.txt"
    validation_files: str = "filelists/tts_data_val_processed.txt"
    text_cleaners: List[str] = field(default_factory=lambda: ["english_cleaners"])
    ################################
    # Audio Parameters             #
    ################################
    max_wav_value: float = 32768.0
    sampling_rate: int = 16000
    filter_length: int = 1024
    hop_length: int = 256
    win_length: int = 1024
    n_mel_channels: int = 80
    mel_fmin: float = 0.0
    mel_fmax: float = 8000.0
    ################################
    # Model Parameters             #
    ################################
    # Evaluated once, when the class is defined.
    n_symbols: int = len(symbols)
    symbols_embedding_dim: int = 512
    # Encoder parameters
    encoder_kernel_size: int = 5
    encoder_n_convolutions: int = 3
    encoder_embedding_dim: int = 512
    # Decoder parameters
    n_frames_per_step: int = 1  # currently only 1 is supported
    decoder_rnn_dim: int = 1024
    prenet_dim: int = 256
    max_decoder_steps: int = 1000
    gate_threshold: float = 0.5
    p_attention_dropout: float = 0.1
    p_decoder_dropout: float = 0.1
    # Attention parameters
    attention_rnn_dim: int = 1024
    attention_dim: int = 128
    # Location Layer parameters
    attention_location_n_filters: int = 32
    attention_location_kernel_size: int = 31
    # Mel-post processing network parameters
    postnet_embedding_dim: int = 512
    postnet_kernel_size: int = 5
    postnet_n_convolutions: int = 5
    ################################
    # Optimization Hyperparameters #
    ################################
    use_saved_learning_rate: bool = False
    learning_rate: float = 1e-3
    weight_decay: float = 1e-6
    grad_clip_thresh: float = 1.0
    batch_size: int = 4
    mask_padding: bool = True  # set model's padded outputs to padded values
--- a/taco2/layers.py
+++ b/taco2/layers.py
--- a/taco2/loss_function.py
+++ b/taco2/loss_function.py
--- a/taco2/model.py
+++ b/taco2/model.py
--- a/taco2/stft.py
+++ b/taco2/stft.py
--- a/taco2/text/LICENSE
+++ b/taco2/text/LICENSE
--- a/taco2/text/init.py
+++ b/taco2/text/init.py
--- a/taco2/text/cleaners.py
+++ b/taco2/text/cleaners.py
--- a/taco2/text/cmudict.py
+++ b/taco2/text/cmudict.py
--- a/taco2/text/numbers.py
+++ b/taco2/text/numbers.py
--- a/taco2/text/symbols.py
+++ b/taco2/text/symbols.py
--- a/taco2/tts.py
+++ b/taco2/tts.py
@@ -3,19 +3,14 @@
 import numpy as np
 import torch
 from .hparams import create_hparams
 from .text import text_to_sequence
 from .glow import WaveGlow
 # import os
 # import soundfile as sf
 import pyaudio
 import klepto
 from librosa import resample
 from librosa.effects import time_stretch
-from sia.file_utils import cached_model_path
+import klepto
 from sia.instruments import do_time
 from .model import Tacotron2
 from glow import WaveGlow
 from .hparams import HParams
 from .text import text_to_sequence
 TTS_SAMPLE_RATE = 22050
 OUTPUT_SAMPLE_RATE = 16000
@@ -35,43 +30,34 @@ WAVEGLOW_CONFIG = {
 class TTSModel(object):
    """docstring for TTSModel."""
-    def __init__(self):
+    def __init__(self, tacotron2_path, waveglow_path):
        super(TTSModel, self).__init__()
-        hparams = create_hparams()
+        hparams = HParams()
        hparams.sampling_rate = TTS_SAMPLE_RATE
        self.model = Tacotron2(hparams)
        tacotron2_path = cached_model_path("tacotron2_model")
        self.model.load_state_dict(
            torch.load(tacotron2_path, map_location="cpu")["state_dict"]
        )
        self.model.eval()
        waveglow_path = cached_model_path("waveglow_model")
        self.waveglow = WaveGlow(**WAVEGLOW_CONFIG)
        wave_params = torch.load(waveglow_path, map_location="cpu")
        self.waveglow = WaveGlow(**WAVEGLOW_CONFIG)
        self.waveglow.load_state_dict(wave_params)
        self.waveglow.eval()
        for k in self.waveglow.convinv:
            k.float()
        self.k_cache = klepto.archives.file_archive(cached=False)
-        self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
+        self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(self.synth_speech)
            self.synth_speech
        )
        # workaround from
        # https://github.com/NVIDIA/waveglow/issues/127
        for m in self.waveglow.modules():
            if "Conv" in str(type(m)):
                setattr(m, "padding_mode", "zeros")
    @do_time
    def synth_speech(self, t):
        text = t
-        sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[
+        sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[None, :]
            None, :
        ]
        sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long()
-        mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(
+        mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(sequence)
            sequence
        )
        with torch.no_grad():
            audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)
        audio = audio_t[0].data.cpu().numpy()
@@ -130,10 +116,7 @@ def display(data):
 def player_gen():
    audio_interface = pyaudio.PyAudio()
    _audio_stream = audio_interface.open(
-        format=pyaudio.paInt16,
+        format=pyaudio.paInt16, channels=1, rate=OUTPUT_SAMPLE_RATE, output=True
        channels=1,
        rate=OUTPUT_SAMPLE_RATE,
        output=True,
    )
    def play_device(data):
@@ -144,14 +127,30 @@ def player_gen():
 def synthesize_corpus():
-    tts_model = TTSModel()
+    tts_model = TTSModel(
        "/Users/malar/Work/tacotron2_statedict.pt",
        "/Users/malar/Work/waveglow.pt",
    )
    all_data = []
    for (i, line) in enumerate(open("corpus.txt").readlines()):
-        print('synthesizing... "{}"'.format(line.strip()))
+        print(f'synthesizing... "{line.strip()}"')
        data = tts_model.synth_speech(line.strip())
        all_data.append(data)
    return all_data
 def repl(
    tacotron2_path="/Users/malar/Work/tacotron2_statedict.pt",
    # alternative WaveGlow checkpoint: "/Users/malar/Work/waveglow_256channels.pt"
    waveglow_path="/Users/malar/Work/waveglow.pt",
 ):
    """Build one step of an interactive text-to-speech prompt.

    The models are loaded and the audio player is created once, up front.
    The returned callable takes no arguments; each call reads one line from
    the ``tts >`` prompt, synthesizes the stripped text and plays the result.

    Args:
        tacotron2_path: Tacotron2 checkpoint path passed to ``TTSModel``.
        waveglow_path: WaveGlow checkpoint path passed to ``TTSModel``.
            Both defaults are the paths that were previously hard-coded.

    Returns:
        A zero-argument function performing one prompt/synthesize/play cycle.
    """
    tts_model = TTSModel(tacotron2_path, waveglow_path)
    player = player_gen()
    def loop():
        text = input('tts >')
        data = tts_model.synth_speech(text.strip())
        player(data)
    return loop
 def play_corpus(corpus_synths):
    player = player_gen()
@@ -160,11 +159,13 @@ def play_corpus(corpus_synths):
 def main():
-    corpus_synth_data = synthesize_corpus()
+    # corpus_synth_data = synthesize_corpus()
-    play_corpus(corpus_synth_data)
+    # play_corpus(corpus_synth_data)
-    import pdb
+    interactive_loop = repl()
-
+    while True:
-    pdb.set_trace()
+        interactive_loop()
    # import pdb
    # pdb.set_trace()
 if __name__ == "__main__":
--- a/taco2/utils.py
+++ b/taco2/utils.py