mirror of https://github.com/malarinv/tacotron2
packaged taco2
parent
1f83e8687c
commit
a10a6d517e
|
|
@ -0,0 +1,8 @@
|
|||
=======
|
||||
Credits
|
||||
=======
|
||||
|
||||
Contributors
|
||||
------------
|
||||
|
||||
* Malar Kannan <malarkannan.invention@gmail.com>
|
||||
|
|
@ -0,0 +1,8 @@
|
|||
=======
|
||||
History
|
||||
=======
|
||||
|
||||
0.1.0 (2019-09-20)
|
||||
------------------
|
||||
|
||||
* First release on PyPI.
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
include AUTHORS.rst
|
||||
include HISTORY.rst
|
||||
include LICENSE
|
||||
include README.md
|
||||
|
||||
recursive-include tests *
|
||||
recursive-exclude * __pycache__
|
||||
recursive-exclude * *.py[co]
|
||||
|
||||
recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
# Taco2 TTS
|
||||
|
||||
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black)
|
||||
|
||||
> Generate speech audio from text
|
||||
---
|
||||
|
||||
# Table of Contents
|
||||
|
||||
* [Features](#features)
|
||||
* [Installation](#installation)
|
||||
* [Usage](#usage)
|
||||
|
||||
# Features
|
||||
|
||||
* Tacotron2 Synthesized Speech
|
||||
|
||||
|
||||
# Installation
|
||||
Install the package for production use. This also downloads its dependencies:
|
||||
```bash
|
||||
python setup.py install
|
||||
```
|
||||
|
||||
> Still facing an issue? Check the [Issues](#issues) section or open a new issue.
|
||||
|
||||
The installation should be smooth with Python 3.6 or newer.
|
||||
|
||||
# Usage
|
||||
> API
|
||||
```python
|
||||
tts_model = TTSModel("/path/to/tacotron2_model","/path/to/waveglow_model")
|
||||
SPEECH_AUDIO = tts_model.synth_speech(TEXT)
|
||||
```
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
Hello world!
|
||||
How have you been?
|
||||
Today is a good day.
|
||||
This seems to be working good
|
||||
88
hparams.py
88
hparams.py
|
|
@ -1,88 +0,0 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
import tensorflow as tf
|
||||
from .text import symbols
|
||||
|
||||
|
||||
# changed path, sampling rate and batch size
|
||||
def create_hparams(hparams_string=None, verbose=False):
    """Build the default hyperparameters, then apply optional overrides.

    Args:
        hparams_string: Optional override string handed to ``HParams.parse``.
            Skipped when falsy.
        verbose: When true, log the final values after any overrides.

    Returns:
        The populated ``tf.contrib.training.HParams`` object.
    """
    # Experiment parameters
    experiment = {
        "epochs": 500,
        "iters_per_checkpoint": 1000,
        "seed": 1234,
        "dynamic_loss_scaling": True,
        "fp16_run": False,
        "distributed_run": False,
        "dist_backend": "nccl",
        "dist_url": "tcp://localhost:54321",
        "cudnn_enabled": True,
        "cudnn_benchmark": False,
        "ignore_layers": ["embedding.weight"],
    }

    # Data parameters
    data = {
        "load_mel_from_disk": False,
        "training_files": "lists/tts_data_train_processed.txt",
        "validation_files": "filelists/tts_data_val_processed.txt",
        "text_cleaners": ["english_cleaners"],
    }

    # Audio parameters
    audio = {
        "max_wav_value": 32768.0,
        "sampling_rate": 16000,
        "filter_length": 1024,
        "hop_length": 256,
        "win_length": 1024,
        "n_mel_channels": 80,
        "mel_fmin": 0.0,
        "mel_fmax": 8000.0,
    }

    # Model parameters
    model = {
        "n_symbols": len(symbols),
        "symbols_embedding_dim": 512,
        # Encoder
        "encoder_kernel_size": 5,
        "encoder_n_convolutions": 3,
        "encoder_embedding_dim": 512,
        # Decoder
        "n_frames_per_step": 1,  # currently only 1 is supported
        "decoder_rnn_dim": 1024,
        "prenet_dim": 256,
        "max_decoder_steps": 1000,
        "gate_threshold": 0.5,
        "p_attention_dropout": 0.1,
        "p_decoder_dropout": 0.1,
        # Attention
        "attention_rnn_dim": 1024,
        "attention_dim": 128,
        # Location layer
        "attention_location_n_filters": 32,
        "attention_location_kernel_size": 31,
        # Mel post-processing network
        "postnet_embedding_dim": 512,
        "postnet_kernel_size": 5,
        "postnet_n_convolutions": 5,
    }

    # Optimization hyperparameters
    optimization = {
        "use_saved_learning_rate": False,
        "learning_rate": 1e-3,
        "weight_decay": 1e-6,
        "grad_clip_thresh": 1.0,
        "batch_size": 4,
        "mask_padding": True,  # set model's padded outputs to padded values
    }

    # Unpack the groups in their original order so the keyword order is unchanged.
    hparams = tf.contrib.training.HParams(
        **experiment, **data, **audio, **model, **optimization
    )

    if hparams_string:
        tf.logging.info("Parsing command line hparams: %s", hparams_string)
        hparams.parse(hparams_string)

    if verbose:
        tf.logging.info("Final parsed hparams: %s", hparams.values())

    return hparams
|
||||
|
|
@ -0,0 +1,2 @@
|
|||
numpy==1.17.2
|
||||
torch==1.2.0
|
||||
38
server.py
38
server.py
|
|
@ -1,38 +0,0 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
import grpc
|
||||
import time
|
||||
from sia.proto import tts_pb2
|
||||
from sia.proto import tts_pb2_grpc
|
||||
from concurrent import futures
|
||||
from .tts import TTSModel
|
||||
|
||||
|
||||
class TTSServer:
    """Service object that answers text-to-speech requests with a ``TTSModel``."""

    def __init__(self):
        # Build the model once; every request goes through this same instance.
        self.tts_model = TTSModel()

    def TextToSpeechAPI(self, request, context):
        """Synthesize ``request.text`` and wrap the result in a ``SpeechResponse``.

        ``context`` is accepted but not used.
        """
        audio = self.tts_model.synth_speech(request.text)
        return tts_pb2.SpeechResponse(response=audio)
|
||||
|
||||
|
||||
def main():
    """Serve ``TTSServer`` over gRPC on localhost:50060 until interrupted.

    The server runs on a single-worker thread pool. The call blocks until a
    ``KeyboardInterrupt`` (Ctrl-C) arrives, then stops the server.
    """
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
    tts_server = TTSServer()
    tts_pb2_grpc.add_ServerServicer_to_server(tts_server, server)
    server.add_insecure_port("localhost:50060")
    server.start()
    print("TTSServer started!")

    try:
        # Keep the main thread alive after start() by sleeping in a loop.
        while True:
            time.sleep(10000)
    except KeyboardInterrupt:
        # Stop immediately on Ctrl-C. Calling start() a second time here, as
        # the code used to, never shut the server down.
        server.stop(0)


if __name__ == "__main__":
    main()
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
[bumpversion]
current_version = 0.1.0
commit = True
tag = True

[bumpversion:file:setup.py]
# setup.py spells the version with double quotes (version="0.1.0"), so the
# search/replace patterns must use double quotes as well or they never match.
search = version="{current_version}"
replace = version="{new_version}"

[bumpversion:file:taco2/__init__.py]
search = __version__ = '{current_version}'
replace = __version__ = '{new_version}'

[bdist_wheel]
# NOTE(review): universal = 1 tags the wheel py2.py3; the package uses
# f-strings and dataclasses, so confirm Python 2 is really meant to be covered.
universal = 1

[flake8]
exclude = docs

[aliases]
# Define setup.py command aliases here
test = pytest

[tool:pytest]
collect_ignore = ['setup.py']

[easy_install]
# NOTE(review): points at a package index on localhost; confirm this is meant
# to be committed.
index-url = http://localhost:8080/simple
|
||||
|
|
@ -0,0 +1,59 @@
|
|||
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""The setup script."""

from setuptools import setup, find_packages

# Read the long-description sources as UTF-8 regardless of the platform locale.
with open("README.md", encoding="utf-8") as readme_file:
    readme = readme_file.read()

with open("HISTORY.rst", encoding="utf-8") as history_file:
    history = history_file.read()

requirements = [
    "klepto==0.1.6",
    "numpy==1.16.4",
    "inflect==0.2.5",
    "librosa==0.6.0",
    "scipy==1.3.0",
    "Unidecode==1.0.22",
    "torch==1.1.0",
    "PyAudio==0.2.11",
]

setup_requirements = ["pytest-runner"]

test_requirements = ["pytest"]

packages = find_packages()

setup(
    author="Malar Kannan",
    author_email="malar@agaralabs.com",
    # Only Python 3.6+ is advertised: the package uses f-strings, and the
    # README states 3.6 or newer.
    # NOTE(review): the package also imports ``dataclasses``, which is stdlib
    # only from 3.7 - confirm whether 3.6 needs the backport.
    classifiers=[
        "Development Status :: 2 - Pre-Alpha",
        "Intended Audience :: Developers",
        "Natural Language :: English",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
    ],
    python_requires=">=3.6",
    description="Taco2 TTS package.",
    install_requires=requirements,
    long_description=readme + "\n\n" + history,
    # The README that leads the long description is Markdown.
    long_description_content_type="text/markdown",
    include_package_data=True,
    keywords="tacotron2",
    name="taco2-tts",
    packages=packages,
    setup_requires=setup_requirements,
    test_suite="tests",
    tests_require=test_requirements,
    url="https://github.com/malarinv/tacotron2",
    version="0.1.0",
    zip_safe=False,
    # ``tts`` lives inside the ``taco2`` package (it uses package-relative
    # imports), so the console script must reference it by its full path.
    entry_points={"console_scripts": ["tts_debug = taco2.tts:main"]},
)
|
||||
|
|
@ -0,0 +1,76 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# import tensorflow as tf
|
||||
from dataclasses import dataclass, field
|
||||
from .text import symbols
|
||||
|
||||
@dataclass
class HParams:
    """Default hyperparameters for the Tacotron2 model.

    Every setting is an annotated dataclass field, so ``HParams()`` yields the
    defaults and individual values can be overridden either by keyword
    (``HParams(batch_size=8)``) or by attribute assignment afterwards.
    List-valued settings use ``default_factory`` so each instance owns its own
    list instead of sharing one class-level object.
    """

    ################################
    # Experiment Parameters        #
    ################################
    epochs: int = 500
    iters_per_checkpoint: int = 1000
    seed: int = 1234
    dynamic_loss_scaling: bool = True
    fp16_run: bool = False
    distributed_run: bool = False
    dist_backend: str = "nccl"
    dist_url: str = "tcp://localhost:54321"
    cudnn_enabled: bool = True
    cudnn_benchmark: bool = False
    ignore_layers: list = field(default_factory=lambda: ["embedding.weight"])

    ################################
    # Data Parameters              #
    ################################
    load_mel_from_disk: bool = False
    training_files: str = "lists/tts_data_train_processed.txt"
    validation_files: str = "filelists/tts_data_val_processed.txt"
    text_cleaners: list = field(default_factory=lambda: ["english_cleaners"])

    ################################
    # Audio Parameters             #
    ################################
    max_wav_value: float = 32768.0
    sampling_rate: int = 16000
    filter_length: int = 1024
    hop_length: int = 256
    win_length: int = 1024
    n_mel_channels: int = 80
    mel_fmin: float = 0.0
    mel_fmax: float = 8000.0

    ################################
    # Model Parameters             #
    ################################
    # Evaluated once, when the class is defined.
    n_symbols: int = len(symbols)
    symbols_embedding_dim: int = 512

    # Encoder parameters
    encoder_kernel_size: int = 5
    encoder_n_convolutions: int = 3
    encoder_embedding_dim: int = 512

    # Decoder parameters
    n_frames_per_step: int = 1  # currently only 1 is supported
    decoder_rnn_dim: int = 1024
    prenet_dim: int = 256
    max_decoder_steps: int = 1000
    gate_threshold: float = 0.5
    p_attention_dropout: float = 0.1
    p_decoder_dropout: float = 0.1

    # Attention parameters
    attention_rnn_dim: int = 1024
    attention_dim: int = 128

    # Location Layer parameters
    attention_location_n_filters: int = 32
    attention_location_kernel_size: int = 31

    # Mel-post processing network parameters
    postnet_embedding_dim: int = 512
    postnet_kernel_size: int = 5
    postnet_n_convolutions: int = 5

    ################################
    # Optimization Hyperparameters #
    ################################
    use_saved_learning_rate: bool = False
    learning_rate: float = 1e-3
    weight_decay: float = 1e-6
    grad_clip_thresh: float = 1.0
    batch_size: int = 4
    mask_padding: bool = True  # set model's padded outputs to padded values
|
||||
|
|
@ -3,19 +3,14 @@
|
|||
|
||||
import numpy as np
|
||||
import torch
|
||||
from .hparams import create_hparams
|
||||
from .text import text_to_sequence
|
||||
from .glow import WaveGlow
|
||||
|
||||
# import os
|
||||
# import soundfile as sf
|
||||
import pyaudio
|
||||
import klepto
|
||||
from librosa import resample
|
||||
from librosa.effects import time_stretch
|
||||
from sia.file_utils import cached_model_path
|
||||
from sia.instruments import do_time
|
||||
import klepto
|
||||
from .model import Tacotron2
|
||||
from glow import WaveGlow
|
||||
from .hparams import HParams
|
||||
from .text import text_to_sequence
|
||||
|
||||
TTS_SAMPLE_RATE = 22050
|
||||
OUTPUT_SAMPLE_RATE = 16000
|
||||
|
|
@ -35,43 +30,34 @@ WAVEGLOW_CONFIG = {
|
|||
class TTSModel(object):
|
||||
"""docstring for TTSModel."""
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, tacotron2_path, waveglow_path):
|
||||
super(TTSModel, self).__init__()
|
||||
hparams = create_hparams()
|
||||
hparams = HParams()
|
||||
hparams.sampling_rate = TTS_SAMPLE_RATE
|
||||
self.model = Tacotron2(hparams)
|
||||
tacotron2_path = cached_model_path("tacotron2_model")
|
||||
self.model.load_state_dict(
|
||||
torch.load(tacotron2_path, map_location="cpu")["state_dict"]
|
||||
)
|
||||
self.model.eval()
|
||||
waveglow_path = cached_model_path("waveglow_model")
|
||||
self.waveglow = WaveGlow(**WAVEGLOW_CONFIG)
|
||||
wave_params = torch.load(waveglow_path, map_location="cpu")
|
||||
self.waveglow = WaveGlow(**WAVEGLOW_CONFIG)
|
||||
self.waveglow.load_state_dict(wave_params)
|
||||
self.waveglow.eval()
|
||||
for k in self.waveglow.convinv:
|
||||
k.float()
|
||||
self.k_cache = klepto.archives.file_archive(cached=False)
|
||||
self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
|
||||
self.synth_speech
|
||||
)
|
||||
self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(self.synth_speech)
|
||||
# workaround from
|
||||
# https://github.com/NVIDIA/waveglow/issues/127
|
||||
for m in self.waveglow.modules():
|
||||
if "Conv" in str(type(m)):
|
||||
setattr(m, "padding_mode", "zeros")
|
||||
|
||||
@do_time
|
||||
def synth_speech(self, t):
|
||||
text = t
|
||||
sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[
|
||||
None, :
|
||||
]
|
||||
sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[None, :]
|
||||
sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long()
|
||||
mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(
|
||||
sequence
|
||||
)
|
||||
mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(sequence)
|
||||
with torch.no_grad():
|
||||
audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)
|
||||
audio = audio_t[0].data.cpu().numpy()
|
||||
|
|
@ -130,10 +116,7 @@ def display(data):
|
|||
def player_gen():
|
||||
audio_interface = pyaudio.PyAudio()
|
||||
_audio_stream = audio_interface.open(
|
||||
format=pyaudio.paInt16,
|
||||
channels=1,
|
||||
rate=OUTPUT_SAMPLE_RATE,
|
||||
output=True,
|
||||
format=pyaudio.paInt16, channels=1, rate=OUTPUT_SAMPLE_RATE, output=True
|
||||
)
|
||||
|
||||
def play_device(data):
|
||||
|
|
@ -144,14 +127,30 @@ def player_gen():
|
|||
|
||||
|
||||
def synthesize_corpus(
    tacotron2_path="/Users/malar/Work/tacotron2_statedict.pt",
    waveglow_path="/Users/malar/Work/waveglow.pt",
    corpus_path="corpus.txt",
):
    """Synthesize speech for every line of a text corpus.

    Args:
        tacotron2_path: Tacotron2 checkpoint path handed to ``TTSModel``.
        waveglow_path: WaveGlow checkpoint path handed to ``TTSModel``.
        corpus_path: Text file read line by line; each line is stripped and
            synthesized as one utterance.

    Returns:
        A list holding the ``synth_speech`` result for each line, in file order.
    """
    tts_model = TTSModel(tacotron2_path, waveglow_path)
    all_data = []
    # Use a context manager so the corpus file is closed deterministically.
    with open(corpus_path) as corpus_file:
        for line in corpus_file:
            text = line.strip()
            print(f'synthesizing... "{text}"')
            all_data.append(tts_model.synth_speech(text))
    return all_data
|
||||
|
||||
def repl(
    tacotron2_path="/Users/malar/Work/tacotron2_statedict.pt",
    waveglow_path="/Users/malar/Work/waveglow.pt",
):
    """Build one step of an interactive text-to-speech prompt.

    Args:
        tacotron2_path: Tacotron2 checkpoint path handed to ``TTSModel``.
        waveglow_path: WaveGlow checkpoint path handed to ``TTSModel``.

    Returns:
        A zero-argument callable. Each call prompts for one line of text,
        synthesizes it and passes the result to the player from
        ``player_gen()``.
    """
    # Load the model and open the player once; every prompt reuses them.
    tts_model = TTSModel(tacotron2_path, waveglow_path)
    player = player_gen()

    def loop():
        text = input('tts >')
        data = tts_model.synth_speech(text.strip())
        player(data)

    return loop
|
||||
|
||||
|
||||
def play_corpus(corpus_synths):
|
||||
player = player_gen()
|
||||
|
|
@ -160,11 +159,13 @@ def play_corpus(corpus_synths):
|
|||
|
||||
|
||||
def main():
    """Run the interactive text-to-speech prompt until the user exits.

    The stale corpus playback and the ``pdb.set_trace()`` breakpoint are gone,
    so only the prompt loop remains. Ctrl-D or Ctrl-C ends the session.
    """
    interactive_loop = repl()
    try:
        while True:
            interactive_loop()
    except (EOFError, KeyboardInterrupt):
        # Leave the prompt without a traceback; the bare print moves the
        # shell prompt onto a fresh line.
        print()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
Loading…
Reference in New Issue