mirror of https://github.com/malarinv/tacotron2
packaged taco2
parent
1f83e8687c
commit
a10a6d517e
|
|
@ -0,0 +1,8 @@
|
||||||
|
=======
|
||||||
|
Credits
|
||||||
|
=======
|
||||||
|
|
||||||
|
Contributors
|
||||||
|
------------
|
||||||
|
|
||||||
|
* Malar Kannan <malarkannan.invention@gmail.com>
|
||||||
|
|
@ -0,0 +1,8 @@
|
||||||
|
=======
|
||||||
|
History
|
||||||
|
=======
|
||||||
|
|
||||||
|
0.1.0 (2019-09-20)
|
||||||
|
------------------
|
||||||
|
|
||||||
|
* First release on PyPI.
|
||||||
|
|
@ -0,0 +1,10 @@
|
||||||
|
include AUTHORS.rst
|
||||||
|
include HISTORY.rst
|
||||||
|
include LICENSE
|
||||||
|
include README.md
|
||||||
|
|
||||||
|
recursive-include tests *
|
||||||
|
recursive-exclude * __pycache__
|
||||||
|
recursive-exclude * *.py[co]
|
||||||
|
|
||||||
|
recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif
|
||||||
|
|
@ -0,0 +1,34 @@
|
||||||
|
# Taco2 TTS
|
||||||
|
|
||||||
|
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black)
|
||||||
|
|
||||||
|
> Generate speech audio from text
|
||||||
|
---
|
||||||
|
|
||||||
|
# Table of Contents
|
||||||
|
|
||||||
|
* [Features](#features)
|
||||||
|
* [Installation](#installation)
|
||||||
|
* [Usage](#usage)
|
||||||
|
|
||||||
|
# Features
|
||||||
|
|
||||||
|
* Tacotron2 Synthesized Speech
|
||||||
|
|
||||||
|
|
||||||
|
# Installation
|
||||||
|
Install the package for production use with the command below; it also downloads the dependencies.
|
||||||
|
```bash
|
||||||
|
python setup.py install
|
||||||
|
```
|
||||||
|
|
||||||
|
> Still facing an issue? Check the existing [issues](https://github.com/malarinv/tacotron2/issues) or open a new one.
|
||||||
|
|
||||||
|
The installation should be smooth with Python 3.7 or newer (the package relies on the standard-library `dataclasses` module).
|
||||||
|
|
||||||
|
# Usage
|
||||||
|
> API
|
||||||
|
```python
|
||||||
|
tts_model = TTSModel("/path/to/tacotron2_model","/path/to/waveglow_model")
|
||||||
|
SPEECH_AUDIO = tts_model.synth_speech(TEXT)
|
||||||
|
```
|
||||||
|
|
@ -0,0 +1,4 @@
|
||||||
|
Hello world!
|
||||||
|
How have you been?
|
||||||
|
Today is a good day.
|
||||||
|
This seems to be working good
|
||||||
88
hparams.py
88
hparams.py
|
|
@ -1,88 +0,0 @@
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
import tensorflow as tf
|
|
||||||
from .text import symbols
|
|
||||||
|
|
||||||
|
|
||||||
# changed path, sampling rate and batch size
|
|
||||||
def create_hparams(hparams_string=None, verbose=False):
|
|
||||||
"""Create model hyperparameters. Parse nondefault from given string."""
|
|
||||||
|
|
||||||
hparams = tf.contrib.training.HParams(
|
|
||||||
################################
|
|
||||||
# Experiment Parameters #
|
|
||||||
################################
|
|
||||||
epochs=500,
|
|
||||||
iters_per_checkpoint=1000,
|
|
||||||
seed=1234,
|
|
||||||
dynamic_loss_scaling=True,
|
|
||||||
fp16_run=False,
|
|
||||||
distributed_run=False,
|
|
||||||
dist_backend="nccl",
|
|
||||||
dist_url="tcp://localhost:54321",
|
|
||||||
cudnn_enabled=True,
|
|
||||||
cudnn_benchmark=False,
|
|
||||||
ignore_layers=["embedding.weight"],
|
|
||||||
################################
|
|
||||||
# Data Parameters #
|
|
||||||
################################
|
|
||||||
load_mel_from_disk=False,
|
|
||||||
training_files="lists/tts_data_train_processed.txt",
|
|
||||||
validation_files="filelists/tts_data_val_processed.txt",
|
|
||||||
text_cleaners=["english_cleaners"],
|
|
||||||
################################
|
|
||||||
# Audio Parameters #
|
|
||||||
################################
|
|
||||||
max_wav_value=32768.0,
|
|
||||||
sampling_rate=16000,
|
|
||||||
filter_length=1024,
|
|
||||||
hop_length=256,
|
|
||||||
win_length=1024,
|
|
||||||
n_mel_channels=80,
|
|
||||||
mel_fmin=0.0,
|
|
||||||
mel_fmax=8000.0,
|
|
||||||
################################
|
|
||||||
# Model Parameters #
|
|
||||||
################################
|
|
||||||
n_symbols=len(symbols),
|
|
||||||
symbols_embedding_dim=512,
|
|
||||||
# Encoder parameters
|
|
||||||
encoder_kernel_size=5,
|
|
||||||
encoder_n_convolutions=3,
|
|
||||||
encoder_embedding_dim=512,
|
|
||||||
# Decoder parameters
|
|
||||||
n_frames_per_step=1, # currently only 1 is supported
|
|
||||||
decoder_rnn_dim=1024,
|
|
||||||
prenet_dim=256,
|
|
||||||
max_decoder_steps=1000,
|
|
||||||
gate_threshold=0.5,
|
|
||||||
p_attention_dropout=0.1,
|
|
||||||
p_decoder_dropout=0.1,
|
|
||||||
# Attention parameters
|
|
||||||
attention_rnn_dim=1024,
|
|
||||||
attention_dim=128,
|
|
||||||
# Location Layer parameters
|
|
||||||
attention_location_n_filters=32,
|
|
||||||
attention_location_kernel_size=31,
|
|
||||||
# Mel-post processing network parameters
|
|
||||||
postnet_embedding_dim=512,
|
|
||||||
postnet_kernel_size=5,
|
|
||||||
postnet_n_convolutions=5,
|
|
||||||
################################
|
|
||||||
# Optimization Hyperparameters #
|
|
||||||
################################
|
|
||||||
use_saved_learning_rate=False,
|
|
||||||
learning_rate=1e-3,
|
|
||||||
weight_decay=1e-6,
|
|
||||||
grad_clip_thresh=1.0,
|
|
||||||
batch_size=4,
|
|
||||||
mask_padding=True, # set model's padded outputs to padded values
|
|
||||||
)
|
|
||||||
|
|
||||||
if hparams_string:
|
|
||||||
tf.logging.info("Parsing command line hparams: %s", hparams_string)
|
|
||||||
hparams.parse(hparams_string)
|
|
||||||
|
|
||||||
if verbose:
|
|
||||||
tf.logging.info("Final parsed hparams: %s", hparams.values())
|
|
||||||
|
|
||||||
return hparams
|
|
||||||
|
|
@ -0,0 +1,2 @@
|
||||||
|
numpy==1.17.2
|
||||||
|
torch==1.2.0
|
||||||
38
server.py
38
server.py
|
|
@ -1,38 +0,0 @@
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
import grpc
|
|
||||||
import time
|
|
||||||
from sia.proto import tts_pb2
|
|
||||||
from sia.proto import tts_pb2_grpc
|
|
||||||
from concurrent import futures
|
|
||||||
from .tts import TTSModel
|
|
||||||
|
|
||||||
|
|
||||||
class TTSServer:
|
|
||||||
def __init__(self):
|
|
||||||
self.tts_model = TTSModel()
|
|
||||||
|
|
||||||
def TextToSpeechAPI(self, request, context):
|
|
||||||
while True:
|
|
||||||
input_text = request.text
|
|
||||||
speech_response = self.tts_model.synth_speech(input_text)
|
|
||||||
return tts_pb2.SpeechResponse(response=speech_response)
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
|
|
||||||
tts_server = TTSServer()
|
|
||||||
tts_pb2_grpc.add_ServerServicer_to_server(tts_server, server)
|
|
||||||
server.add_insecure_port("localhost:50060")
|
|
||||||
server.start()
|
|
||||||
print("TTSServer started!")
|
|
||||||
|
|
||||||
try:
|
|
||||||
while True:
|
|
||||||
time.sleep(10000)
|
|
||||||
except KeyboardInterrupt:
|
|
||||||
server.start()
|
|
||||||
# server.stop(0)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
|
|
@ -0,0 +1,28 @@
|
||||||
|
[bumpversion]
|
||||||
|
current_version = 0.1.0
|
||||||
|
commit = True
|
||||||
|
tag = True
|
||||||
|
|
||||||
|
[bumpversion:file:setup.py]
# setup.py writes the version with double quotes (version="0.1.0"), so the
# search/replace patterns must use double quotes as well or bumpversion
# finds nothing to replace.
search = version="{current_version}"
replace = version="{new_version}"
|
||||||
|
|
||||||
|
[bumpversion:file:taco2/__init__.py]
|
||||||
|
search = __version__ = '{current_version}'
|
||||||
|
replace = __version__ = '{new_version}'
|
||||||
|
|
||||||
|
[bdist_wheel]
|
||||||
|
universal = 1
|
||||||
|
|
||||||
|
[flake8]
|
||||||
|
exclude = docs
|
||||||
|
|
||||||
|
[aliases]
|
||||||
|
# Define setup.py command aliases here
|
||||||
|
test = pytest
|
||||||
|
|
||||||
|
[tool:pytest]
|
||||||
|
collect_ignore = ['setup.py']
|
||||||
|
|
||||||
|
[easy_install]
|
||||||
|
index-url = http://localhost:8080/simple
|
||||||
|
|
@ -0,0 +1,59 @@
|
||||||
|
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""The setup script."""

from setuptools import setup, find_packages

# Read the long-description sources with an explicit encoding so the build
# does not depend on the locale of the machine running it.
with open("README.md", encoding="utf-8") as readme_file:
    readme = readme_file.read()

with open("HISTORY.rst", encoding="utf-8") as history_file:
    history = history_file.read()

# NOTE(review): the pinned requirements list shipped alongside this script
# pins numpy==1.17.2 and torch==1.2.0, which disagrees with the pins below;
# confirm which versions are intended.
requirements = [
    "klepto==0.1.6",
    "numpy==1.16.4",
    "inflect==0.2.5",
    "librosa==0.6.0",
    "scipy==1.3.0",
    "Unidecode==1.0.22",
    "torch==1.1.0",
    "PyAudio==0.2.11",
]

setup_requirements = ["pytest-runner"]

test_requirements = ["pytest"]

packages = find_packages()

setup(
    author="Malar Kannan",
    author_email="malar@agaralabs.com",
    # Only versions the code can actually run on are advertised: the package
    # uses dataclasses (standard library from 3.7) and f-strings.
    classifiers=[
        "Development Status :: 2 - Pre-Alpha",
        "Intended Audience :: Developers",
        "Natural Language :: English",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.7",
    ],
    description="Taco2 TTS package.",
    install_requires=requirements,
    long_description=readme + "\n\n" + history,
    # README.md is Markdown; without this the index renders it as reST.
    long_description_content_type="text/markdown",
    include_package_data=True,
    keywords="tacotron2",
    name="taco2-tts",
    packages=packages,
    python_requires=">=3.7",
    setup_requires=setup_requirements,
    test_suite="tests",
    tests_require=test_requirements,
    url="https://github.com/malarinv/tacotron2",
    version="0.1.0",
    zip_safe=False,
    # The tts module lives inside the taco2 package (it uses relative
    # imports), so the console script must reference it by its package path.
    entry_points={"console_scripts": ["tts_debug = taco2.tts:main"]},
)
|
||||||
|
|
@ -0,0 +1,76 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# import tensorflow as tf
|
||||||
|
from dataclasses import dataclass, field
from typing import List

from .text import symbols
|
||||||
|
|
||||||
|
@dataclass
class HParams(object):
    """Default hyperparameters for the Tacotron2 model, data and optimizer.

    Every attribute carries a type annotation so that ``@dataclass`` registers
    it as a field; the decorator ignores un-annotated assignments, which would
    leave plain shared class attributes and a constructor that accepts no
    overrides. ``HParams()`` yields the defaults below, and single values can
    be overridden by keyword, e.g. ``HParams(batch_size=8)``.

    List-valued defaults are built through ``field(default_factory=...)`` so
    each instance owns its own list instead of sharing one mutable object.
    """

    ################################
    # Experiment Parameters        #
    ################################
    epochs: int = 500
    iters_per_checkpoint: int = 1000
    seed: int = 1234
    dynamic_loss_scaling: bool = True
    fp16_run: bool = False
    distributed_run: bool = False
    dist_backend: str = "nccl"
    dist_url: str = "tcp://localhost:54321"
    cudnn_enabled: bool = True
    cudnn_benchmark: bool = False
    ignore_layers: List[str] = field(default_factory=lambda: ["embedding.weight"])
    ################################
    # Data Parameters              #
    ################################
    load_mel_from_disk: bool = False
    # NOTE(review): the two file lists live under different directories
    # ("lists/" vs "filelists/") - confirm this is intentional.
    training_files: str = "lists/tts_data_train_processed.txt"
    validation_files: str = "filelists/tts_data_val_processed.txt"
    text_cleaners: List[str] = field(default_factory=lambda: ["english_cleaners"])
    ################################
    # Audio Parameters             #
    ################################
    max_wav_value: float = 32768.0
    sampling_rate: int = 16000
    filter_length: int = 1024
    hop_length: int = 256
    win_length: int = 1024
    n_mel_channels: int = 80
    mel_fmin: float = 0.0
    mel_fmax: float = 8000.0
    ################################
    # Model Parameters             #
    ################################
    # Evaluated once, when the class body is executed.
    n_symbols: int = len(symbols)
    symbols_embedding_dim: int = 512
    # Encoder parameters
    encoder_kernel_size: int = 5
    encoder_n_convolutions: int = 3
    encoder_embedding_dim: int = 512
    # Decoder parameters
    n_frames_per_step: int = 1  # currently only 1 is supported
    decoder_rnn_dim: int = 1024
    prenet_dim: int = 256
    max_decoder_steps: int = 1000
    gate_threshold: float = 0.5
    p_attention_dropout: float = 0.1
    p_decoder_dropout: float = 0.1
    # Attention parameters
    attention_rnn_dim: int = 1024
    attention_dim: int = 128
    # Location Layer parameters
    attention_location_n_filters: int = 32
    attention_location_kernel_size: int = 31
    # Mel-post processing network parameters
    postnet_embedding_dim: int = 512
    postnet_kernel_size: int = 5
    postnet_n_convolutions: int = 5
    ################################
    # Optimization Hyperparameters #
    ################################
    use_saved_learning_rate: bool = False
    learning_rate: float = 1e-3
    weight_decay: float = 1e-6
    grad_clip_thresh: float = 1.0
    batch_size: int = 4
    mask_padding: bool = True  # set model's padded outputs to padded values
|
||||||
|
|
@ -3,19 +3,14 @@
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
from .hparams import create_hparams
|
|
||||||
from .text import text_to_sequence
|
|
||||||
from .glow import WaveGlow
|
|
||||||
|
|
||||||
# import os
|
|
||||||
# import soundfile as sf
|
|
||||||
import pyaudio
|
import pyaudio
|
||||||
import klepto
|
|
||||||
from librosa import resample
|
from librosa import resample
|
||||||
from librosa.effects import time_stretch
|
from librosa.effects import time_stretch
|
||||||
from sia.file_utils import cached_model_path
|
import klepto
|
||||||
from sia.instruments import do_time
|
|
||||||
from .model import Tacotron2
|
from .model import Tacotron2
|
||||||
|
from glow import WaveGlow
|
||||||
|
from .hparams import HParams
|
||||||
|
from .text import text_to_sequence
|
||||||
|
|
||||||
TTS_SAMPLE_RATE = 22050
|
TTS_SAMPLE_RATE = 22050
|
||||||
OUTPUT_SAMPLE_RATE = 16000
|
OUTPUT_SAMPLE_RATE = 16000
|
||||||
|
|
@ -35,43 +30,34 @@ WAVEGLOW_CONFIG = {
|
||||||
class TTSModel(object):
|
class TTSModel(object):
|
||||||
"""docstring for TTSModel."""
|
"""docstring for TTSModel."""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, tacotron2_path, waveglow_path):
|
||||||
super(TTSModel, self).__init__()
|
super(TTSModel, self).__init__()
|
||||||
hparams = create_hparams()
|
hparams = HParams()
|
||||||
hparams.sampling_rate = TTS_SAMPLE_RATE
|
hparams.sampling_rate = TTS_SAMPLE_RATE
|
||||||
self.model = Tacotron2(hparams)
|
self.model = Tacotron2(hparams)
|
||||||
tacotron2_path = cached_model_path("tacotron2_model")
|
|
||||||
self.model.load_state_dict(
|
self.model.load_state_dict(
|
||||||
torch.load(tacotron2_path, map_location="cpu")["state_dict"]
|
torch.load(tacotron2_path, map_location="cpu")["state_dict"]
|
||||||
)
|
)
|
||||||
self.model.eval()
|
self.model.eval()
|
||||||
waveglow_path = cached_model_path("waveglow_model")
|
|
||||||
self.waveglow = WaveGlow(**WAVEGLOW_CONFIG)
|
|
||||||
wave_params = torch.load(waveglow_path, map_location="cpu")
|
wave_params = torch.load(waveglow_path, map_location="cpu")
|
||||||
|
self.waveglow = WaveGlow(**WAVEGLOW_CONFIG)
|
||||||
self.waveglow.load_state_dict(wave_params)
|
self.waveglow.load_state_dict(wave_params)
|
||||||
self.waveglow.eval()
|
self.waveglow.eval()
|
||||||
for k in self.waveglow.convinv:
|
for k in self.waveglow.convinv:
|
||||||
k.float()
|
k.float()
|
||||||
self.k_cache = klepto.archives.file_archive(cached=False)
|
self.k_cache = klepto.archives.file_archive(cached=False)
|
||||||
self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
|
self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(self.synth_speech)
|
||||||
self.synth_speech
|
|
||||||
)
|
|
||||||
# workaround from
|
# workaround from
|
||||||
# https://github.com/NVIDIA/waveglow/issues/127
|
# https://github.com/NVIDIA/waveglow/issues/127
|
||||||
for m in self.waveglow.modules():
|
for m in self.waveglow.modules():
|
||||||
if "Conv" in str(type(m)):
|
if "Conv" in str(type(m)):
|
||||||
setattr(m, "padding_mode", "zeros")
|
setattr(m, "padding_mode", "zeros")
|
||||||
|
|
||||||
@do_time
|
|
||||||
def synth_speech(self, t):
|
def synth_speech(self, t):
|
||||||
text = t
|
text = t
|
||||||
sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[
|
sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[None, :]
|
||||||
None, :
|
|
||||||
]
|
|
||||||
sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long()
|
sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long()
|
||||||
mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(
|
mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(sequence)
|
||||||
sequence
|
|
||||||
)
|
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)
|
audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)
|
||||||
audio = audio_t[0].data.cpu().numpy()
|
audio = audio_t[0].data.cpu().numpy()
|
||||||
|
|
@ -130,10 +116,7 @@ def display(data):
|
||||||
def player_gen():
|
def player_gen():
|
||||||
audio_interface = pyaudio.PyAudio()
|
audio_interface = pyaudio.PyAudio()
|
||||||
_audio_stream = audio_interface.open(
|
_audio_stream = audio_interface.open(
|
||||||
format=pyaudio.paInt16,
|
format=pyaudio.paInt16, channels=1, rate=OUTPUT_SAMPLE_RATE, output=True
|
||||||
channels=1,
|
|
||||||
rate=OUTPUT_SAMPLE_RATE,
|
|
||||||
output=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def play_device(data):
|
def play_device(data):
|
||||||
|
|
@ -144,14 +127,30 @@ def player_gen():
|
||||||
|
|
||||||
|
|
||||||
def synthesize_corpus():
    """Synthesize speech for every line of ``corpus.txt``.

    Builds a ``TTSModel`` from the hard-coded Tacotron2 and WaveGlow
    checkpoint paths below, then passes each stripped line of ``corpus.txt``
    (resolved against the current working directory) to
    ``TTSModel.synth_speech``, printing a progress message per line.

    Returns:
        list: one ``synth_speech`` result per corpus line, in file order.
    """
    tts_model = TTSModel(
        "/Users/malar/Work/tacotron2_statedict.pt",
        "/Users/malar/Work/waveglow.pt",
    )
    # Read through a context manager so the file handle is closed even if
    # synthesis later raises.
    with open("corpus.txt") as corpus_file:
        texts = [line.strip() for line in corpus_file]
    all_data = []
    for text in texts:
        print(f'synthesizing... "{text}"')
        all_data.append(tts_model.synth_speech(text))
    return all_data
|
||||||
|
|
||||||
|
def repl():
    """Prepare an interactive text-to-speech prompt.

    Builds a ``TTSModel`` from the hard-coded checkpoint paths and an audio
    player from ``player_gen()``, then hands back a closure over both.

    Returns:
        callable: a zero-argument function; each call reads one line with
        the ``tts >`` prompt, synthesizes the stripped text and passes the
        result to the player.
    """
    synthesizer = TTSModel(
        "/Users/malar/Work/tacotron2_statedict.pt",
        # alternative vocoder checkpoint:
        # "/Users/malar/Work/waveglow_256channels.pt",
        "/Users/malar/Work/waveglow.pt",
    )
    play = player_gen()

    def loop():
        typed = input("tts >")
        play(synthesizer.synth_speech(typed.strip()))

    return loop
|
||||||
|
|
||||||
|
|
||||||
def play_corpus(corpus_synths):
|
def play_corpus(corpus_synths):
|
||||||
player = player_gen()
|
player = player_gen()
|
||||||
|
|
@ -160,11 +159,13 @@ def play_corpus(corpus_synths):
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Run the interactive text-to-speech prompt until the user quits.

    Obtains the prompt step from ``repl()`` and calls it repeatedly. Ctrl-D
    (``EOFError`` raised by ``input``) or Ctrl-C (``KeyboardInterrupt``) ends
    the session quietly instead of terminating with a traceback. The batch
    mode (``synthesize_corpus`` followed by ``play_corpus``) is not run here.
    """
    interactive_loop = repl()
    try:
        while True:
            interactive_loop()
    except (EOFError, KeyboardInterrupt):
        # Finish the pending prompt line so the shell prompt starts cleanly.
        print()
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
Loading…
Reference in New Issue