1
0
mirror of https://github.com/malarinv/tacotron2 synced 2026-03-08 01:32:35 +00:00

5 Commits

Author SHA1 Message Date
6d3679d760 relax scipy version 2021-07-02 23:21:45 +05:30
a851e80db2 1. added rpyc server optional package
2. updated path variable
2020-03-05 16:59:55 +05:30
cb0c8ddd06 1. make pyaudio as extra requirement
2. warn if pyaudio not installed if player_gen is used
2020-01-22 14:09:37 +05:30
42a85d177e added glow as module 2019-11-29 00:20:01 +05:30
5efb1e2758 1. fix synth_speech_fast interface
2. rename private methods
2019-11-28 18:11:47 +05:30
6 changed files with 135 additions and 32 deletions

View File

@@ -15,12 +15,19 @@ requirements = [
"numpy~=1.16.4", "numpy~=1.16.4",
"inflect==0.2.5", "inflect==0.2.5",
"librosa==0.6.0", "librosa==0.6.0",
"scipy~=1.3.0", "scipy~=1.3",
"Unidecode==1.0.22", "Unidecode==1.0.22",
"torch~=1.1.0", "torch~=1.1.0",
"PyAudio==0.2.11"
] ]
extra_requirements = {
"playback": ["PyAudio==0.2.11"],
"server": [
"google-cloud-texttospeech==1.0.1",
"rpyc==4.1.4",
],
}
setup_requirements = ["pytest-runner"] setup_requirements = ["pytest-runner"]
test_requirements = ["pytest"] test_requirements = ["pytest"]
@@ -44,6 +51,7 @@ setup(
], ],
description="Taco2 TTS package.", description="Taco2 TTS package.",
install_requires=requirements, install_requires=requirements,
extras_require=extra_requirements,
long_description=readme + "\n\n" + history, long_description=readme + "\n\n" + history,
include_package_data=True, include_package_data=True,
keywords="tacotron2 tts", keywords="tacotron2 tts",
@@ -55,5 +63,10 @@ setup(
url="https://github.com/malarinv/tacotron2", url="https://github.com/malarinv/tacotron2",
version="0.3.0", version="0.3.0",
zip_safe=False, zip_safe=False,
entry_points={"console_scripts": ("tts_debug = taco2.tts:main",)}, entry_points={
"console_scripts": (
"tts_debug = taco2.tts:main",
"tts_rpyc_server = taco2.server.__main__:main",
)
},
) )

0
taco2/server/__init__.py Normal file
View File

48
taco2/server/__main__.py Normal file
View File

@@ -0,0 +1,48 @@
import os
import logging
import rpyc
from rpyc.utils.server import ThreadedServer
from .backend import TTSSynthesizer
# Select the synthesis backend via the TTS_BACKEND env var; defaults to the
# local Tacotron2 model ("taco2"); "gcp" selects Google Cloud TTS (see backend.py).
tts_backend = os.environ.get("TTS_BACKEND", "taco2")
# Built once at import time so the (potentially slow) model load happens
# before the server starts accepting connections.
tts_synthesizer = TTSSynthesizer(backend=tts_backend)
class TTSService(rpyc.Service):
    """RPyC service exposing the module-level ``tts_synthesizer`` over the wire.

    Only methods prefixed with ``exposed_`` are callable by remote clients.
    """

    def on_connect(self, conn):
        # No per-connection initialization is needed.
        pass

    def on_disconnect(self, conn):
        # No per-connection cleanup is needed.
        pass

    def exposed_synth_speech(self, utterance: str):
        """Synthesize ``utterance`` and return the audio to the remote caller."""
        return tts_synthesizer.synth_speech(utterance)

    def exposed_synth_speech_cb(self, utterance: str, respond):
        """Synthesize ``utterance`` and hand the audio to the ``respond`` callback.

        ``respond`` is a remote callable supplied by the client, letting the
        client receive the result asynchronously instead of blocking on return.
        """
        respond(tts_synthesizer.synth_speech(utterance))
def main():
    """Configure logging and run the RPyC TTS server until interrupted.

    The listen port comes from the TTS_RPYC_PORT env var (default 7754).
    ``ThreadedServer.start()`` blocks for the lifetime of the process.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )
    listen_port = int(os.environ.get("TTS_RPYC_PORT", "7754"))
    logging.info("starting tts server...")
    server = ThreadedServer(TTSService, port=listen_port)
    server.start()
# Allow running the server directly (e.g. ``python -m taco2.server``) in
# addition to the ``tts_rpyc_server`` console-script entry point.
if __name__ == "__main__":
    main()

45
taco2/server/backend.py Normal file
View File

@@ -0,0 +1,45 @@
import os
from google.cloud import texttospeech
from ..tts import TTSModel
# Comma-separated pair of checkpoint paths: "<tacotron2 weights>,<waveglow weights>".
tts_model_weights = os.environ.get(
    "TTS_MODELS", "models/tacotron2_statedict.pt,models/waveglow_256channels.pt"
)
# NOTE(review): tts_creds is read but never used below — presumably the
# google-cloud client reads GOOGLE_APPLICATION_CREDENTIALS itself; confirm.
tts_creds = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS", "/code/config/gre2e/keys/gre2e_gcp.json"
)
# maxsplit=1 so any additional commas stay inside the waveglow path.
taco2, wav_glow = tts_model_weights.split(",", 1)
class TTSSynthesizer(object):
    """Facade that binds ``self.synth_speech`` to the selected TTS backend.

    Supported backends:
      * ``"taco2"`` — local Tacotron2 + WaveGlow checkpoints (module-level
        ``taco2`` / ``wav_glow`` paths).
      * ``"gcp"``   — Google Cloud Text-to-Speech (en-US, LINEAR16).

    After construction, ``synth_speech(text)`` returns synthesized audio.

    Raises:
        ValueError: if ``backend`` is not one of the supported names.
    """

    def __init__(self, backend="taco2"):
        super(TTSSynthesizer, self).__init__()
        if backend == "taco2":
            # Loads both model checkpoints up front.
            tts_model = TTSModel(f"{taco2}", f"{wav_glow}")
            self.synth_speech = tts_model.synth_speech
        elif backend == "gcp":
            client = texttospeech.TextToSpeechClient()
            # Build the voice request: language "en-US", default voice gender.
            voice = texttospeech.types.VoiceSelectionParams(language_code="en-US")
            # Select the returned audio format.
            # NOTE(review): LINEAR16 presumably matches what callers expect
            # from the taco2 backend — confirm the two outputs are compatible.
            audio_config = texttospeech.types.AudioConfig(
                audio_encoding=texttospeech.enums.AudioEncoding.LINEAR16
            )

            def gcp_synthesize(speech_text):
                # One synchronous text-to-speech request per utterance.
                synthesis_input = texttospeech.types.SynthesisInput(text=speech_text)
                response = client.synthesize_speech(
                    synthesis_input, voice, audio_config
                )
                return response.audio_content

            self.synth_speech = gcp_synthesize
        else:
            # Fail fast: the original fell through silently, leaving
            # self.synth_speech unset and surfacing later as a confusing
            # AttributeError at call time.
            raise ValueError(f"unknown TTS backend: {backend!r}")

View File

@@ -3,9 +3,9 @@
import numpy as np import numpy as np
import torch import torch
import pyaudio
import klepto import klepto
import argparse import argparse
import warnings
from pathlib import Path from pathlib import Path
from .model import Tacotron2 from .model import Tacotron2
from glow import WaveGlow from glow import WaveGlow
@@ -17,7 +17,7 @@ from .audio_processing import griffin_lim, postprocess_audio
OUTPUT_SAMPLE_RATE = 22050 OUTPUT_SAMPLE_RATE = 22050
GL_ITERS = 30 GL_ITERS = 30
VOCODER_MODEL = "wavglow" VOCODER_WAVEGLOW, VOCODER_GL = "wavglow", "gl"
# config from # config from
# https://github.com/NVIDIA/waveglow/blob/master/config.json # https://github.com/NVIDIA/waveglow/blob/master/config.json
@@ -74,11 +74,11 @@ class TTSModel(object):
self.waveglow, n_mel_channels=hparams.n_mel_channels self.waveglow, n_mel_channels=hparams.n_mel_channels
) )
self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)( self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
self.synth_speech self._synth_speech
) )
else: else:
self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)( self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
self.synth_speech_fast self._synth_speech_fast
) )
self.taco_stft = TacotronSTFT( self.taco_stft = TacotronSTFT(
hparams.filter_length, hparams.filter_length,
@@ -89,7 +89,7 @@ class TTSModel(object):
mel_fmax=4000, mel_fmax=4000,
) )
def generate_mel_postnet(self, text): def _generate_mel_postnet(self, text):
sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[None, :] sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[None, :]
if torch.cuda.is_available(): if torch.cuda.is_available():
sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long() sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()
@@ -102,14 +102,14 @@ class TTSModel(object):
return mel_outputs_postnet return mel_outputs_postnet
def synth_speech_array(self, text, vocoder): def synth_speech_array(self, text, vocoder):
mel_outputs_postnet = self.generate_mel_postnet(text) mel_outputs_postnet = self._generate_mel_postnet(text)
if vocoder == "wavglow": if vocoder == VOCODER_WAVEGLOW:
with torch.no_grad(): with torch.no_grad():
audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666) audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)
audio_t = self.denoiser(audio_t, 0.1)[0] audio_t = self.denoiser(audio_t, 0.1)[0]
audio = audio_t[0].data audio = audio_t[0].data
elif vocoder == "gl": elif vocoder == VOCODER_GL:
mel_decompress = self.taco_stft.spectral_de_normalize(mel_outputs_postnet) mel_decompress = self.taco_stft.spectral_de_normalize(mel_outputs_postnet)
mel_decompress = mel_decompress.transpose(1, 2).data.cpu() mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
spec_from_mel_scaling = 1000 spec_from_mel_scaling = 1000
@@ -122,7 +122,7 @@ class TTSModel(object):
audio = griffin_lim( audio = griffin_lim(
torch.autograd.Variable(spec_from_mel[:, :, :-1]), torch.autograd.Variable(spec_from_mel[:, :, :-1]),
self.taco_stft.stft_fn, self.taco_stft.stft_fn,
60, GL_ITERS,
) )
audio = audio.squeeze() audio = audio.squeeze()
else: else:
@@ -130,40 +130,37 @@ class TTSModel(object):
audio = audio.cpu().numpy() audio = audio.cpu().numpy()
return audio return audio
def synth_speech( def _synth_speech(
self, text, speed: float = 1.0, sample_rate: int = OUTPUT_SAMPLE_RATE self, text, speed: float = 1.0, sample_rate: int = OUTPUT_SAMPLE_RATE
): ):
audio = self.synth_speech_array(text, VOCODER_MODEL) audio = self.synth_speech_array(text, VOCODER_WAVEGLOW)
return postprocess_audio( return postprocess_audio(
audio, src_rate=self.hparams.sampling_rate, dst_rate=sample_rate, tempo=speed audio,
src_rate=self.hparams.sampling_rate,
dst_rate=sample_rate,
tempo=speed,
) )
def synth_speech_fast( def _synth_speech_fast(
self, text, speed: float = 1.0, sample_rate: int = OUTPUT_SAMPLE_RATE self, text, speed: float = 1.0, sample_rate: int = OUTPUT_SAMPLE_RATE
): ):
mel_outputs_postnet = self.generate_mel_postnet(text) audio = self.synth_speech_array(text, VOCODER_GL)
mel_decompress = self.taco_stft.spectral_de_normalize(mel_outputs_postnet)
mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
spec_from_mel_scaling = 1000
spec_from_mel = torch.mm(mel_decompress[0], self.taco_stft.mel_basis)
spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
spec_from_mel = spec_from_mel * spec_from_mel_scaling
audio = griffin_lim(
torch.autograd.Variable(spec_from_mel[:, :, :-1]),
self.taco_stft.stft_fn,
GL_ITERS,
)
audio = audio.squeeze()
audio = audio.cpu().numpy()
return postprocess_audio( return postprocess_audio(
audio, tempo=speed, src_rate=self.hparams.sampling_rate, dst_rate=sample_rate audio,
tempo=speed,
src_rate=self.hparams.sampling_rate,
dst_rate=sample_rate,
) )
def player_gen(): def player_gen():
try:
import pyaudio
except ModuleNotFoundError:
warnings.warn("module 'pyaudio' is not installed requried for playback")
return
audio_interface = pyaudio.PyAudio() audio_interface = pyaudio.PyAudio()
_audio_stream = audio_interface.open( _audio_stream = audio_interface.open(
format=pyaudio.paInt16, channels=1, rate=OUTPUT_SAMPLE_RATE, output=True format=pyaudio.paInt16, channels=1, rate=OUTPUT_SAMPLE_RATE, output=True