1
0
mirror of https://github.com/malarinv/tacotron2 synced 2026-03-08 09:42:34 +00:00

8 Commits

Author SHA1 Message Date
6d3679d760 relax scipy version 2021-07-02 23:21:45 +05:30
a851e80db2 1. added rpyc server optional package
2. updated path variable
2020-03-05 16:59:55 +05:30
cb0c8ddd06 1. make pyaudio as extra requirement
2. warn if pyaudio not installed if player_gen is used
2020-01-22 14:09:37 +05:30
42a85d177e added glow as module 2019-11-29 00:20:01 +05:30
5efb1e2758 1. fix synth_speech_fast interface
2. rename private methods
2019-11-28 18:11:47 +05:30
ea11c5199e tested gl/wavglow working 2019-11-28 18:00:52 +05:30
78eed2d295 bugfix for vocoder arg 2019-11-28 17:03:46 +05:30
009b87e716 update dependency version and add speed/sample rate and vocoder args 2019-11-28 16:57:36 +05:30
8 changed files with 172 additions and 46 deletions

View File

@@ -12,15 +12,22 @@ with open("HISTORY.rst") as history_file:
requirements = [ requirements = [
"klepto==0.1.6", "klepto==0.1.6",
"numpy==1.16.4", "numpy~=1.16.4",
"inflect==0.2.5", "inflect==0.2.5",
"librosa==0.6.0", "librosa==0.6.0",
"scipy==1.3.0", "scipy~=1.3",
"Unidecode==1.0.22", "Unidecode==1.0.22",
"torch==1.1.0", "torch~=1.1.0",
"PyAudio==0.2.11"
] ]
extra_requirements = {
"playback": ["PyAudio==0.2.11"],
"server": [
"google-cloud-texttospeech==1.0.1",
"rpyc==4.1.4",
],
}
setup_requirements = ["pytest-runner"] setup_requirements = ["pytest-runner"]
test_requirements = ["pytest"] test_requirements = ["pytest"]
@@ -44,6 +51,7 @@ setup(
], ],
description="Taco2 TTS package.", description="Taco2 TTS package.",
install_requires=requirements, install_requires=requirements,
extras_require=extra_requirements,
long_description=readme + "\n\n" + history, long_description=readme + "\n\n" + history,
include_package_data=True, include_package_data=True,
keywords="tacotron2 tts", keywords="tacotron2 tts",
@@ -53,7 +61,12 @@ setup(
test_suite="tests", test_suite="tests",
tests_require=test_requirements, tests_require=test_requirements,
url="https://github.com/malarinv/tacotron2", url="https://github.com/malarinv/tacotron2",
version="0.2.0", version="0.3.0",
zip_safe=False, zip_safe=False,
entry_points={"console_scripts": ("tts_debug = taco2.tts:main",)}, entry_points={
"console_scripts": (
"tts_debug = taco2.tts:main",
"tts_rpyc_server = taco2.server.__main__:main",
)
},
) )

View File

@@ -35,7 +35,7 @@ class HParams(object):
# Audio Parameters # # Audio Parameters #
################################ ################################
max_wav_value = 32768.0 max_wav_value = 32768.0
sampling_rate = 16000 sampling_rate = 22050
filter_length = 1024 filter_length = 1024
hop_length = 256 hop_length = 256
win_length = 1024 win_length = 1024

0
taco2/server/__init__.py Normal file
View File

48
taco2/server/__main__.py Normal file
View File

@@ -0,0 +1,48 @@
import os
import logging
import rpyc
from rpyc.utils.server import ThreadedServer
from .backend import TTSSynthesizer
# Backend name comes from the TTS_BACKEND environment variable ("taco2" if unset).
tts_backend = os.environ.get("TTS_BACKEND", "taco2")
# One synthesizer is built at import time and shared by the exposed
# TTSService methods.
tts_synthesizer = TTSSynthesizer(backend=tts_backend)
class TTSService(rpyc.Service):
    """RPyC service that exposes speech synthesis through ``tts_synthesizer``."""

    def on_connect(self, conn):
        """Hook run when a connection is created; no initialisation is needed."""

    def on_disconnect(self, conn):
        """Hook run after a connection has closed; no finalisation is needed."""

    def exposed_synth_speech(self, utterance: str):
        """Synthesize *utterance* and return the synthesizer's result."""
        return tts_synthesizer.synth_speech(utterance)

    def exposed_synth_speech_cb(self, utterance: str, respond):
        """Synthesize *utterance* and pass the result to the *respond* callback."""
        audio = tts_synthesizer.synth_speech(utterance)
        respond(audio)
def main():
    """Configure logging and start a ThreadedServer serving ``TTSService``.

    The port is read from the TTS_RPYC_PORT environment variable and
    defaults to 7754.
    """
    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)
    listen_port = int(os.environ.get("TTS_RPYC_PORT", "7754"))
    logging.info("starting tts server...")
    ThreadedServer(TTSService, port=listen_port).start()


if __name__ == "__main__":
    main()

45
taco2/server/backend.py Normal file
View File

@@ -0,0 +1,45 @@
import os
from google.cloud import texttospeech
from ..tts import TTSModel
# Comma-separated pair of weight paths read from TTS_MODELS, in the order
# "<tacotron2 weights>,<waveglow weights>".
tts_model_weights = os.environ.get(
"TTS_MODELS", "models/tacotron2_statedict.pt,models/waveglow_256channels.pt"
)
# Credentials path read from GOOGLE_APPLICATION_CREDENTIALS, with a fallback
# default. NOTE(review): tts_creds is not referenced anywhere in the visible
# code -- confirm whether it is still needed.
tts_creds = os.environ.get(
"GOOGLE_APPLICATION_CREDENTIALS", "/code/config/gre2e/keys/gre2e_gcp.json"
)
# Split on the first comma only; this raises ValueError at import time when
# TTS_MODELS contains no comma.
taco2, wav_glow = tts_model_weights.split(",", 1)
class TTSSynthesizer(object):
    """Text-to-speech front end that dispatches to one of two backends.

    After construction, ``self.synth_speech`` is a callable that takes the
    text to speak and returns whatever the selected backend produces.
    """

    def __init__(self, backend="taco2"):
        """Bind ``self.synth_speech`` to the implementation for *backend*.

        Args:
            backend: ``"taco2"`` to use a ``TTSModel`` built from the
                module-level ``taco2`` / ``wav_glow`` weight paths, or
                ``"gcp"`` to use the Google Cloud Text-to-Speech client.

        Raises:
            ValueError: if *backend* is not a supported name. Failing here
                avoids a later ``AttributeError`` on ``synth_speech``.
        """
        super(TTSSynthesizer, self).__init__()
        if backend == "taco2":
            # Constructing TTSModel loads the models.
            tts_model = TTSModel(taco2, wav_glow)
            self.synth_speech = tts_model.synth_speech
        elif backend == "gcp":
            client = texttospeech.TextToSpeechClient()
            # Voice request: only the language code ("en-US") is specified.
            voice = texttospeech.types.VoiceSelectionParams(language_code="en-US")
            # Ask for LINEAR16-encoded audio in the response.
            audio_config = texttospeech.types.AudioConfig(
                audio_encoding=texttospeech.enums.AudioEncoding.LINEAR16
            )

            def gcp_synthesize(speech_text):
                """Send *speech_text* to the GCP client and return its audio content."""
                synthesis_input = texttospeech.types.SynthesisInput(text=speech_text)
                response = client.synthesize_speech(
                    synthesis_input, voice, audio_config
                )
                return response.audio_content

            self.synth_speech = gcp_synthesize
        else:
            raise ValueError(
                f"unknown TTS backend {backend!r}; expected 'taco2' or 'gcp'"
            )

View File

@@ -84,8 +84,8 @@ class STFT(torch.nn.Module):
forward_basis *= fft_window forward_basis *= fft_window
inverse_basis *= fft_window inverse_basis *= fft_window
self.register_buffer("forward_basis", forward_basis.float()) self.register_buffer("forward_basis", forward_basis.float().to(DEVICE))
self.register_buffer("inverse_basis", inverse_basis.float()) self.register_buffer("inverse_basis", inverse_basis.float().to(DEVICE))
def transform(self, input_data): def transform(self, input_data):
num_batches = input_data.size(0) num_batches = input_data.size(0)
@@ -121,10 +121,10 @@ class STFT(torch.nn.Module):
return magnitude, phase return magnitude, phase
def inverse(self, magnitude, phase): def inverse(self, magnitude, phase):
phase = phase.to(DEVICE)
recombine_magnitude_phase = torch.cat( recombine_magnitude_phase = torch.cat(
[magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1 [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
) )
inverse_transform = F.conv_transpose1d( inverse_transform = F.conv_transpose1d(
recombine_magnitude_phase, recombine_magnitude_phase,
Variable(self.inverse_basis, requires_grad=False), Variable(self.inverse_basis, requires_grad=False),
@@ -144,11 +144,10 @@ class STFT(torch.nn.Module):
# remove modulation effects # remove modulation effects
approx_nonzero_indices = torch.from_numpy( approx_nonzero_indices = torch.from_numpy(
np.where(window_sum > tiny(window_sum))[0] np.where(window_sum > tiny(window_sum))[0]
) ).to(DEVICE)
window_sum = torch.autograd.Variable( window_sum = torch.autograd.Variable(
torch.from_numpy(window_sum), requires_grad=False torch.from_numpy(window_sum), requires_grad=False
) ).to(DEVICE)
window_sum = window_sum.to(DEVICE)
inverse_transform[:, :, approx_nonzero_indices] /= window_sum[ inverse_transform[:, :, approx_nonzero_indices] /= window_sum[
approx_nonzero_indices approx_nonzero_indices
] ]

View File

@@ -3,9 +3,9 @@
import numpy as np import numpy as np
import torch import torch
import pyaudio
import klepto import klepto
import argparse import argparse
import warnings
from pathlib import Path from pathlib import Path
from .model import Tacotron2 from .model import Tacotron2
from glow import WaveGlow from glow import WaveGlow
@@ -15,9 +15,9 @@ from .text import text_to_sequence
from .denoiser import Denoiser from .denoiser import Denoiser
from .audio_processing import griffin_lim, postprocess_audio from .audio_processing import griffin_lim, postprocess_audio
TTS_SAMPLE_RATE = 22050
OUTPUT_SAMPLE_RATE = 22050 OUTPUT_SAMPLE_RATE = 22050
# OUTPUT_SAMPLE_RATE = 16000 GL_ITERS = 30
VOCODER_WAVEGLOW, VOCODER_GL = "wavglow", "gl"
# config from # config from
# https://github.com/NVIDIA/waveglow/blob/master/config.json # https://github.com/NVIDIA/waveglow/blob/master/config.json
@@ -37,7 +37,7 @@ class TTSModel(object):
def __init__(self, tacotron2_path, waveglow_path, **kwargs): def __init__(self, tacotron2_path, waveglow_path, **kwargs):
super(TTSModel, self).__init__() super(TTSModel, self).__init__()
hparams = HParams(**kwargs) hparams = HParams(**kwargs)
hparams.sampling_rate = TTS_SAMPLE_RATE self.hparams = hparams
self.model = Tacotron2(hparams) self.model = Tacotron2(hparams)
if torch.cuda.is_available(): if torch.cuda.is_available():
self.model.load_state_dict(torch.load(tacotron2_path)["state_dict"]) self.model.load_state_dict(torch.load(tacotron2_path)["state_dict"])
@@ -74,11 +74,11 @@ class TTSModel(object):
self.waveglow, n_mel_channels=hparams.n_mel_channels self.waveglow, n_mel_channels=hparams.n_mel_channels
) )
self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)( self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
self.synth_speech self._synth_speech
) )
else: else:
self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)( self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
self.synth_speech_gl self._synth_speech_fast
) )
self.taco_stft = TacotronSTFT( self.taco_stft = TacotronSTFT(
hparams.filter_length, hparams.filter_length,
@@ -89,7 +89,7 @@ class TTSModel(object):
mel_fmax=4000, mel_fmax=4000,
) )
def generate_mel_postnet(self, text): def _generate_mel_postnet(self, text):
sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[None, :] sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[None, :]
if torch.cuda.is_available(): if torch.cuda.is_available():
sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long() sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()
@@ -101,45 +101,66 @@ class TTSModel(object):
) )
return mel_outputs_postnet return mel_outputs_postnet
def synth_speech_array(self, text): def synth_speech_array(self, text, vocoder):
mel_outputs_postnet = self.generate_mel_postnet(text) mel_outputs_postnet = self._generate_mel_postnet(text)
with torch.no_grad(): if vocoder == VOCODER_WAVEGLOW:
audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666) with torch.no_grad():
audio_t = self.denoiser(audio_t, 0.1)[0] audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)
audio = audio_t[0].data.cpu().numpy() audio_t = self.denoiser(audio_t, 0.1)[0]
audio = audio_t[0].data
elif vocoder == VOCODER_GL:
mel_decompress = self.taco_stft.spectral_de_normalize(mel_outputs_postnet)
mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
spec_from_mel_scaling = 1000
spec_from_mel = torch.mm(mel_decompress[0], self.taco_stft.mel_basis)
spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
spec_from_mel = spec_from_mel * spec_from_mel_scaling
spec_from_mel = (
spec_from_mel.cuda() if torch.cuda.is_available() else spec_from_mel
)
audio = griffin_lim(
torch.autograd.Variable(spec_from_mel[:, :, :-1]),
self.taco_stft.stft_fn,
GL_ITERS,
)
audio = audio.squeeze()
else:
raise ValueError("vocoder arg should be one of [wavglow|gl]")
audio = audio.cpu().numpy()
return audio return audio
def synth_speech(self, text): def _synth_speech(
audio = self.synth_speech_array(text) self, text, speed: float = 1.0, sample_rate: int = OUTPUT_SAMPLE_RATE
):
audio = self.synth_speech_array(text, VOCODER_WAVEGLOW)
return postprocess_audio( return postprocess_audio(
audio, src_rate=TTS_SAMPLE_RATE, dst_rate=OUTPUT_SAMPLE_RATE audio,
src_rate=self.hparams.sampling_rate,
dst_rate=sample_rate,
tempo=speed,
) )
def synth_speech_gl(self, text, griffin_iters=60): def _synth_speech_fast(
mel_outputs_postnet = self.generate_mel_postnet(text) self, text, speed: float = 1.0, sample_rate: int = OUTPUT_SAMPLE_RATE
):
mel_decompress = self.taco_stft.spectral_de_normalize(mel_outputs_postnet) audio = self.synth_speech_array(text, VOCODER_GL)
mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
spec_from_mel_scaling = 1000
spec_from_mel = torch.mm(mel_decompress[0], self.taco_stft.mel_basis)
spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
spec_from_mel = spec_from_mel * spec_from_mel_scaling
audio = griffin_lim(
torch.autograd.Variable(spec_from_mel[:, :, :-1]),
self.taco_stft.stft_fn,
griffin_iters,
)
audio = audio.squeeze()
audio = audio.cpu().numpy()
return postprocess_audio( return postprocess_audio(
audio, tempo=0.6, src_rate=TTS_SAMPLE_RATE, dst_rate=OUTPUT_SAMPLE_RATE audio,
tempo=speed,
src_rate=self.hparams.sampling_rate,
dst_rate=sample_rate,
) )
def player_gen(): def player_gen():
try:
import pyaudio
except ModuleNotFoundError:
warnings.warn("module 'pyaudio' is not installed requried for playback")
return
audio_interface = pyaudio.PyAudio() audio_interface = pyaudio.PyAudio()
_audio_stream = audio_interface.open( _audio_stream = audio_interface.open(
format=pyaudio.paInt16, channels=1, rate=OUTPUT_SAMPLE_RATE, output=True format=pyaudio.paInt16, channels=1, rate=OUTPUT_SAMPLE_RATE, output=True