1
0
mirror of https://github.com/malarinv/tacotron2 synced 2026-03-08 01:32:35 +00:00

11 Commits

Author SHA1 Message Date
6d3679d760 relax scipy version 2021-07-02 23:21:45 +05:30
a851e80db2 1. added rpyc server optional package
2. updated path variable
2020-03-05 16:59:55 +05:30
cb0c8ddd06 1. make pyaudio as extra requirement
2. warn if pyaudio not installed if player_gen is used
2020-01-22 14:09:37 +05:30
42a85d177e added glow as module 2019-11-29 00:20:01 +05:30
5efb1e2758 1. fix synth_speech_fast interface
2. rename private methods
2019-11-28 18:11:47 +05:30
ea11c5199e tested gl/wavglow working 2019-11-28 18:00:52 +05:30
78eed2d295 bugfix for vocoder arg 2019-11-28 17:03:46 +05:30
009b87e716 update dependency version and add speed/sample rate and vocoder args 2019-11-28 16:57:36 +05:30
ac5ffcf6d5 enable gpu support if available 2019-11-27 22:53:41 +05:30
5a30069f0a update tempo and output sample rate 2019-10-09 17:25:51 +05:30
dcc9ab3625 add tts cli args 2019-10-09 16:23:21 +05:30
10 changed files with 282 additions and 98 deletions

View File

@@ -29,13 +29,14 @@ import torch
from torch.autograd import Variable from torch.autograd import Variable
import torch.nn.functional as F import torch.nn.functional as F
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
@torch.jit.script @torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
n_channels_int = n_channels[0] n_channels_int = n_channels[0]
in_act = input_a+input_b in_act = input_a+input_b
t_act = torch.tanh(in_act[:, :n_channels_int, :]) t_act = torch.nn.functional.tanh(in_act[:, :n_channels_int, :])
s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) s_act = torch.nn.functional.sigmoid(in_act[:, n_channels_int:, :])
acts = t_act * s_act acts = t_act * s_act
return acts return acts
@@ -90,7 +91,7 @@ class Invertible1x1Conv(torch.nn.Module):
# Reverse computation # Reverse computation
W_inverse = W.float().inverse() W_inverse = W.float().inverse()
W_inverse = Variable(W_inverse[..., None]) W_inverse = Variable(W_inverse[..., None])
if z.type() == 'torch.HalfTensor': if z.type() == 'torch.cuda.HalfTensor' or z.type() == 'torch.HalfTensor':
W_inverse = W_inverse.half() W_inverse = W_inverse.half()
self.W_inverse = W_inverse self.W_inverse = W_inverse
z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0) z = F.conv1d(z, self.W_inverse, bias=None, stride=1, padding=0)
@@ -117,6 +118,7 @@ class WN(torch.nn.Module):
self.n_channels = n_channels self.n_channels = n_channels
self.in_layers = torch.nn.ModuleList() self.in_layers = torch.nn.ModuleList()
self.res_skip_layers = torch.nn.ModuleList() self.res_skip_layers = torch.nn.ModuleList()
self.cond_layers = torch.nn.ModuleList()
start = torch.nn.Conv1d(n_in_channels, n_channels, 1) start = torch.nn.Conv1d(n_in_channels, n_channels, 1)
start = torch.nn.utils.weight_norm(start, name='weight') start = torch.nn.utils.weight_norm(start, name='weight')
@@ -129,9 +131,6 @@ class WN(torch.nn.Module):
end.bias.data.zero_() end.bias.data.zero_()
self.end = end self.end = end
cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels*n_layers, 1)
self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
for i in range(n_layers): for i in range(n_layers):
dilation = 2 ** i dilation = 2 ** i
padding = int((kernel_size*dilation - dilation)/2) padding = int((kernel_size*dilation - dilation)/2)
@@ -140,6 +139,9 @@ class WN(torch.nn.Module):
in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
self.in_layers.append(in_layer) self.in_layers.append(in_layer)
cond_layer = torch.nn.Conv1d(n_mel_channels, 2*n_channels, 1)
cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
self.cond_layers.append(cond_layer)
# last one is not necessary # last one is not necessary
if i < n_layers - 1: if i < n_layers - 1:
@@ -153,25 +155,24 @@ class WN(torch.nn.Module):
def forward(self, forward_input): def forward(self, forward_input):
audio, spect = forward_input audio, spect = forward_input
audio = self.start(audio) audio = self.start(audio)
output = torch.zeros_like(audio)
n_channels_tensor = torch.IntTensor([self.n_channels])
spect = self.cond_layer(spect)
for i in range(self.n_layers): for i in range(self.n_layers):
spect_offset = i*2*self.n_channels
acts = fused_add_tanh_sigmoid_multiply( acts = fused_add_tanh_sigmoid_multiply(
self.in_layers[i](audio), self.in_layers[i](audio),
spect[:,spect_offset:spect_offset+2*self.n_channels,:], self.cond_layers[i](spect),
n_channels_tensor) torch.IntTensor([self.n_channels]))
res_skip_acts = self.res_skip_layers[i](acts) res_skip_acts = self.res_skip_layers[i](acts)
if i < self.n_layers - 1: if i < self.n_layers - 1:
audio = audio + res_skip_acts[:,:self.n_channels,:] audio = res_skip_acts[:,:self.n_channels,:] + audio
output = output + res_skip_acts[:,self.n_channels:,:] skip_acts = res_skip_acts[:,self.n_channels:,:]
else: else:
output = output + res_skip_acts skip_acts = res_skip_acts
if i == 0:
output = skip_acts
else:
output = skip_acts + output
return self.end(output) return self.end(output)
@@ -257,14 +258,24 @@ class WaveGlow(torch.nn.Module):
spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3)
spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1)
if spect.type() == 'torch.HalfTensor': if torch.cuda.is_available():
audio = torch.HalfTensor(spect.size(0), if spect.type() == 'torch.cuda.HalfTensor':
self.n_remaining_channels, audio = torch.cuda.HalfTensor(spect.size(0),
spect.size(2)).normal_() self.n_remaining_channels,
spect.size(2)).normal_()
else:
audio = torch.cuda.FloatTensor(spect.size(0),
self.n_remaining_channels,
spect.size(2)).normal_()
else: else:
audio = torch.FloatTensor(spect.size(0), if spect.type() == 'torch.HalfTensor':
self.n_remaining_channels, audio = torch.HalfTensor(spect.size(0),
spect.size(2)).normal_() self.n_remaining_channels,
spect.size(2)).normal_()
else:
audio = torch.FloatTensor(spect.size(0),
self.n_remaining_channels,
spect.size(2)).normal_()
audio = torch.autograd.Variable(sigma*audio) audio = torch.autograd.Variable(sigma*audio)
@@ -274,7 +285,6 @@ class WaveGlow(torch.nn.Module):
audio_1 = audio[:,n_half:,:] audio_1 = audio[:,n_half:,:]
output = self.WN[k]((audio_0, spect)) output = self.WN[k]((audio_0, spect))
s = output[:, n_half:, :] s = output[:, n_half:, :]
b = output[:, :n_half, :] b = output[:, :n_half, :]
audio_1 = (audio_1 - b)/torch.exp(s) audio_1 = (audio_1 - b)/torch.exp(s)
@@ -283,10 +293,16 @@ class WaveGlow(torch.nn.Module):
audio = self.convinv[k](audio, reverse=True) audio = self.convinv[k](audio, reverse=True)
if k % self.n_early_every == 0 and k > 0: if k % self.n_early_every == 0 and k > 0:
if spect.type() == 'torch.HalfTensor': if torch.cuda.is_available():
z = torch.HalfTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_() if spect.type() == 'torch.cuda.HalfTensor':
z = torch.cuda.HalfTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_()
else:
z = torch.cuda.FloatTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_()
else: else:
z = torch.FloatTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_() if spect.type() == 'torch.HalfTensor':
z = torch.HalfTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_()
else:
z = torch.FloatTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_()
audio = torch.cat((sigma*z, audio),1) audio = torch.cat((sigma*z, audio),1)
audio = audio.permute(0,2,1).contiguous().view(audio.size(0), -1).data audio = audio.permute(0,2,1).contiguous().view(audio.size(0), -1).data
@@ -298,7 +314,7 @@ class WaveGlow(torch.nn.Module):
for WN in waveglow.WN: for WN in waveglow.WN:
WN.start = torch.nn.utils.remove_weight_norm(WN.start) WN.start = torch.nn.utils.remove_weight_norm(WN.start)
WN.in_layers = remove(WN.in_layers) WN.in_layers = remove(WN.in_layers)
WN.cond_layer = torch.nn.utils.remove_weight_norm(WN.cond_layer) WN.cond_layers = remove(WN.cond_layers)
WN.res_skip_layers = remove(WN.res_skip_layers) WN.res_skip_layers = remove(WN.res_skip_layers)
return waveglow return waveglow

View File

@@ -12,15 +12,22 @@ with open("HISTORY.rst") as history_file:
requirements = [ requirements = [
"klepto==0.1.6", "klepto==0.1.6",
"numpy==1.16.4", "numpy~=1.16.4",
"inflect==0.2.5", "inflect==0.2.5",
"librosa==0.6.0", "librosa==0.6.0",
"scipy==1.3.0", "scipy~=1.3",
"Unidecode==1.0.22", "Unidecode==1.0.22",
"torch==1.1.0", "torch~=1.1.0",
"PyAudio==0.2.11"
] ]
extra_requirements = {
"playback": ["PyAudio==0.2.11"],
"server": [
"google-cloud-texttospeech==1.0.1",
"rpyc==4.1.4",
],
}
setup_requirements = ["pytest-runner"] setup_requirements = ["pytest-runner"]
test_requirements = ["pytest"] test_requirements = ["pytest"]
@@ -44,6 +51,7 @@ setup(
], ],
description="Taco2 TTS package.", description="Taco2 TTS package.",
install_requires=requirements, install_requires=requirements,
extras_require=extra_requirements,
long_description=readme + "\n\n" + history, long_description=readme + "\n\n" + history,
include_package_data=True, include_package_data=True,
keywords="tacotron2 tts", keywords="tacotron2 tts",
@@ -53,7 +61,12 @@ setup(
test_suite="tests", test_suite="tests",
tests_require=test_requirements, tests_require=test_requirements,
url="https://github.com/malarinv/tacotron2", url="https://github.com/malarinv/tacotron2",
version="0.2.0", version="0.3.0",
zip_safe=False, zip_safe=False,
entry_points={"console_scripts": ("tts_debug = taco2.tts:main",)}, entry_points={
"console_scripts": (
"tts_debug = taco2.tts:main",
"tts_rpyc_server = taco2.server.__main__:main",
)
},
) )

View File

@@ -9,9 +9,14 @@ class Denoiser(torch.nn.Module):
def __init__(self, waveglow, filter_length=1024, n_overlap=4, def __init__(self, waveglow, filter_length=1024, n_overlap=4,
win_length=1024, mode='zeros', n_mel_channels=80,): win_length=1024, mode='zeros', n_mel_channels=80,):
super(Denoiser, self).__init__() super(Denoiser, self).__init__()
self.stft = STFT(filter_length=filter_length, if torch.cuda.is_available():
hop_length=int(filter_length/n_overlap), self.stft = STFT(filter_length=filter_length,
win_length=win_length).cpu() hop_length=int(filter_length/n_overlap),
win_length=win_length).cuda()
else:
self.stft = STFT(filter_length=filter_length,
hop_length=int(filter_length/n_overlap),
win_length=win_length).cpu()
if mode == 'zeros': if mode == 'zeros':
mel_input = torch.zeros( mel_input = torch.zeros(
(1, n_mel_channels, 88), (1, n_mel_channels, 88),
@@ -32,7 +37,10 @@ class Denoiser(torch.nn.Module):
self.register_buffer('bias_spec', bias_spec[:, :, 0][:, :, None]) self.register_buffer('bias_spec', bias_spec[:, :, 0][:, :, None])
def forward(self, audio, strength=0.1): def forward(self, audio, strength=0.1):
audio_spec, audio_angles = self.stft.transform(audio.cpu().float()) if torch.cuda.is_available():
audio_spec, audio_angles = self.stft.transform(audio.cuda().float())
else:
audio_spec, audio_angles = self.stft.transform(audio.cpu().float())
audio_spec_denoised = audio_spec - self.bias_spec * strength audio_spec_denoised = audio_spec - self.bias_spec * strength
audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0) audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0)
audio_denoised = self.stft.inverse(audio_spec_denoised, audio_angles) audio_denoised = self.stft.inverse(audio_spec_denoised, audio_angles)

View File

@@ -35,13 +35,13 @@ class HParams(object):
# Audio Parameters # # Audio Parameters #
################################ ################################
max_wav_value = 32768.0 max_wav_value = 32768.0
sampling_rate = 16000 sampling_rate = 22050
filter_length = 1024 filter_length = 1024
hop_length = 256 hop_length = 256
win_length = 1024 win_length = 1024
n_mel_channels: int = 40 n_mel_channels: int = 80
mel_fmin: float = 0.0 mel_fmin: float = 0.0
mel_fmax: float = 4000.0 mel_fmax: float = 8000.0
################################ ################################
# Model Parameters # # Model Parameters #
################################ ################################

0
taco2/server/__init__.py Normal file
View File

48
taco2/server/__main__.py Normal file
View File

@@ -0,0 +1,48 @@
import os
import logging
import rpyc
from rpyc.utils.server import ThreadedServer
from .backend import TTSSynthesizer
# Backend name comes from the environment; "taco2" is the fallback.
tts_backend = os.getenv("TTS_BACKEND", "taco2")
# Built once at import time; the service methods in this module reuse it.
tts_synthesizer = TTSSynthesizer(backend=tts_backend)
class TTSService(rpyc.Service):
    """RPyC service that forwards requests to the module-level synthesizer."""

    def on_connect(self, conn):
        """Hook run when a connection is created; nothing to initialise."""

    def on_disconnect(self, conn):
        """Hook run after a connection has closed; nothing to finalise."""

    def exposed_synth_speech(self, utterance: str):
        """Synthesize ``utterance`` and return the synthesizer's output."""
        return tts_synthesizer.synth_speech(utterance)

    def exposed_synth_speech_cb(self, utterance: str, respond):
        """Synthesize ``utterance`` and pass the output to ``respond``."""
        respond(tts_synthesizer.synth_speech(utterance))
def main():
    """Configure logging and start the TTS RPyC server.

    The listening port is read from the ``TTS_RPYC_PORT`` environment
    variable and defaults to ``7754``.
    """
    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)
    listen_port = int(os.environ.get("TTS_RPYC_PORT", "7754"))
    logging.info("starting tts server...")
    server = ThreadedServer(TTSService, port=listen_port)
    server.start()


if __name__ == "__main__":
    main()

45
taco2/server/backend.py Normal file
View File

@@ -0,0 +1,45 @@
import os
from google.cloud import texttospeech
from ..tts import TTSModel
# Google credentials path (environment override with a built-in default).
# NOTE(review): nothing in the visible code reads ``tts_creds`` - confirm it
# is still needed.
tts_creds = os.getenv(
    "GOOGLE_APPLICATION_CREDENTIALS", "/code/config/gre2e/keys/gre2e_gcp.json"
)
# Comma-separated "<tacotron2 weights>,<waveglow weights>" pair.
tts_model_weights = os.getenv(
    "TTS_MODELS", "models/tacotron2_statedict.pt,models/waveglow_256channels.pt"
)
# Split on the first comma only; a value with no comma raises ValueError here.
taco2, wav_glow = tts_model_weights.split(",", 1)
class TTSSynthesizer(object):
    """Select a text-to-speech backend and expose it as ``synth_speech``.

    After construction ``self.synth_speech`` is a callable that takes the
    text to speak. For ``"taco2"`` it is ``TTSModel.synth_speech``; for
    ``"gcp"`` it is a closure that returns ``response.audio_content`` from
    the Google Cloud Text-to-Speech client.

    Args:
        backend: ``"taco2"`` (default) or ``"gcp"``.

    Raises:
        ValueError: if ``backend`` is not one of the supported names.
            Without this check ``synth_speech`` would be left unset and the
            failure would only appear later as an ``AttributeError``.
    """

    def __init__(self, backend="taco2"):
        super(TTSSynthesizer, self).__init__()
        if backend == "taco2":
            # Loads the models from the module-level weight paths.
            tts_model = TTSModel(taco2, wav_glow)
            self.synth_speech = tts_model.synth_speech
        elif backend == "gcp":
            client = texttospeech.TextToSpeechClient()
            # Voice request: only the language code ("en-US") is specified.
            voice = texttospeech.types.VoiceSelectionParams(language_code="en-US")
            # Type of audio to be returned.
            audio_config = texttospeech.types.AudioConfig(
                audio_encoding=texttospeech.enums.AudioEncoding.LINEAR16
            )

            def gcp_synthesize(speech_text):
                """Request synthesis of ``speech_text`` and return the audio content."""
                synthesis_input = texttospeech.types.SynthesisInput(text=speech_text)
                response = client.synthesize_speech(
                    synthesis_input, voice, audio_config
                )
                return response.audio_content

            self.synth_speech = gcp_synthesize
        else:
            raise ValueError(
                "unsupported TTS backend %r; expected 'taco2' or 'gcp'" % (backend,)
            )

View File

@@ -40,6 +40,7 @@ from scipy.signal import get_window
from librosa.util import pad_center, tiny from librosa.util import pad_center, tiny
from .audio_processing import window_sumsquare from .audio_processing import window_sumsquare
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
class STFT(torch.nn.Module): class STFT(torch.nn.Module):
""" """
@@ -83,8 +84,8 @@ class STFT(torch.nn.Module):
forward_basis *= fft_window forward_basis *= fft_window
inverse_basis *= fft_window inverse_basis *= fft_window
self.register_buffer("forward_basis", forward_basis.float()) self.register_buffer("forward_basis", forward_basis.float().to(DEVICE))
self.register_buffer("inverse_basis", inverse_basis.float()) self.register_buffer("inverse_basis", inverse_basis.float().to(DEVICE))
def transform(self, input_data): def transform(self, input_data):
num_batches = input_data.size(0) num_batches = input_data.size(0)
@@ -120,10 +121,10 @@ class STFT(torch.nn.Module):
return magnitude, phase return magnitude, phase
def inverse(self, magnitude, phase): def inverse(self, magnitude, phase):
phase = phase.to(DEVICE)
recombine_magnitude_phase = torch.cat( recombine_magnitude_phase = torch.cat(
[magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1 [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
) )
inverse_transform = F.conv_transpose1d( inverse_transform = F.conv_transpose1d(
recombine_magnitude_phase, recombine_magnitude_phase,
Variable(self.inverse_basis, requires_grad=False), Variable(self.inverse_basis, requires_grad=False),
@@ -143,13 +144,10 @@ class STFT(torch.nn.Module):
# remove modulation effects # remove modulation effects
approx_nonzero_indices = torch.from_numpy( approx_nonzero_indices = torch.from_numpy(
np.where(window_sum > tiny(window_sum))[0] np.where(window_sum > tiny(window_sum))[0]
) ).to(DEVICE)
window_sum = torch.autograd.Variable( window_sum = torch.autograd.Variable(
torch.from_numpy(window_sum), requires_grad=False torch.from_numpy(window_sum), requires_grad=False
) ).to(DEVICE)
# window_sum = window_sum.cuda() if magnitude.is_cuda else
# window_sum
# initially not commented out
inverse_transform[:, :, approx_nonzero_indices] /= window_sum[ inverse_transform[:, :, approx_nonzero_indices] /= window_sum[
approx_nonzero_indices approx_nonzero_indices
] ]

View File

@@ -3,8 +3,10 @@
import numpy as np import numpy as np
import torch import torch
import pyaudio
import klepto import klepto
import argparse
import warnings
from pathlib import Path
from .model import Tacotron2 from .model import Tacotron2
from glow import WaveGlow from glow import WaveGlow
from .hparams import HParams from .hparams import HParams
@@ -13,8 +15,9 @@ from .text import text_to_sequence
from .denoiser import Denoiser from .denoiser import Denoiser
from .audio_processing import griffin_lim, postprocess_audio from .audio_processing import griffin_lim, postprocess_audio
TTS_SAMPLE_RATE = 22050 OUTPUT_SAMPLE_RATE = 22050
OUTPUT_SAMPLE_RATE = 16000 GL_ITERS = 30
VOCODER_WAVEGLOW, VOCODER_GL = "wavglow", "gl"
# config from # config from
# https://github.com/NVIDIA/waveglow/blob/master/config.json # https://github.com/NVIDIA/waveglow/blob/master/config.json
@@ -34,22 +37,31 @@ class TTSModel(object):
def __init__(self, tacotron2_path, waveglow_path, **kwargs): def __init__(self, tacotron2_path, waveglow_path, **kwargs):
super(TTSModel, self).__init__() super(TTSModel, self).__init__()
hparams = HParams(**kwargs) hparams = HParams(**kwargs)
hparams.sampling_rate = TTS_SAMPLE_RATE self.hparams = hparams
self.model = Tacotron2(hparams) self.model = Tacotron2(hparams)
self.model.load_state_dict( if torch.cuda.is_available():
torch.load(tacotron2_path, map_location="cpu")["state_dict"] self.model.load_state_dict(torch.load(tacotron2_path)["state_dict"])
) self.model.cuda().eval()
self.model.eval() else:
self.model.load_state_dict(
torch.load(tacotron2_path, map_location="cpu")["state_dict"]
)
self.model.eval()
self.k_cache = klepto.archives.file_archive(cached=False) self.k_cache = klepto.archives.file_archive(cached=False)
if waveglow_path: if waveglow_path:
wave_params = torch.load(waveglow_path, map_location="cpu") if torch.cuda.is_available():
wave_params = torch.load(waveglow_path)
else:
wave_params = torch.load(waveglow_path, map_location="cpu")
try: try:
self.waveglow = WaveGlow(**WAVEGLOW_CONFIG) self.waveglow = WaveGlow(**WAVEGLOW_CONFIG)
self.waveglow.load_state_dict(wave_params) self.waveglow.load_state_dict(wave_params)
self.waveglow.eval()
except: except:
self.waveglow = wave_params["model"] self.waveglow = wave_params["model"]
self.waveglow = self.waveglow.remove_weightnorm(self.waveglow) self.waveglow = self.waveglow.remove_weightnorm(self.waveglow)
if torch.cuda.is_available():
self.waveglow.cuda().eval()
else:
self.waveglow.eval() self.waveglow.eval()
# workaround from # workaround from
# https://github.com/NVIDIA/waveglow/issues/127 # https://github.com/NVIDIA/waveglow/issues/127
@@ -57,16 +69,16 @@ class TTSModel(object):
if "Conv" in str(type(m)): if "Conv" in str(type(m)):
setattr(m, "padding_mode", "zeros") setattr(m, "padding_mode", "zeros")
for k in self.waveglow.convinv: for k in self.waveglow.convinv:
k.float() k.float().half()
self.denoiser = Denoiser( self.denoiser = Denoiser(
self.waveglow, n_mel_channels=hparams.n_mel_channels self.waveglow, n_mel_channels=hparams.n_mel_channels
) )
self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)( self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
self.synth_speech self._synth_speech
) )
else: else:
self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)( self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
self.synth_speech_gl self._synth_speech_fast
) )
self.taco_stft = TacotronSTFT( self.taco_stft = TacotronSTFT(
hparams.filter_length, hparams.filter_length,
@@ -77,50 +89,78 @@ class TTSModel(object):
mel_fmax=4000, mel_fmax=4000,
) )
def generate_mel_postnet(self, text): def _generate_mel_postnet(self, text):
sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[None, :] sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[None, :]
sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long() if torch.cuda.is_available():
sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()
else:
sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long()
with torch.no_grad(): with torch.no_grad():
mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference( mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(
sequence sequence
) )
return mel_outputs_postnet return mel_outputs_postnet
def synth_speech(self, text): def synth_speech_array(self, text, vocoder):
mel_outputs_postnet = self.generate_mel_postnet(text) mel_outputs_postnet = self._generate_mel_postnet(text)
with torch.no_grad(): if vocoder == VOCODER_WAVEGLOW:
audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666) with torch.no_grad():
audio_t = self.denoiser(audio_t, 0.1)[0] audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)
audio = audio_t[0].data.cpu().numpy() audio_t = self.denoiser(audio_t, 0.1)[0]
audio = audio_t[0].data
return postprocess_audio( elif vocoder == VOCODER_GL:
audio, src_rate=TTS_SAMPLE_RATE, dst_rate=OUTPUT_SAMPLE_RATE mel_decompress = self.taco_stft.spectral_de_normalize(mel_outputs_postnet)
) mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
spec_from_mel_scaling = 1000
def synth_speech_gl(self, text, griffin_iters=60): spec_from_mel = torch.mm(mel_decompress[0], self.taco_stft.mel_basis)
mel_outputs_postnet = self.generate_mel_postnet(text) spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
spec_from_mel = spec_from_mel * spec_from_mel_scaling
mel_decompress = self.taco_stft.spectral_de_normalize(mel_outputs_postnet) spec_from_mel = (
mel_decompress = mel_decompress.transpose(1, 2).data.cpu() spec_from_mel.cuda() if torch.cuda.is_available() else spec_from_mel
spec_from_mel_scaling = 1000 )
spec_from_mel = torch.mm(mel_decompress[0], self.taco_stft.mel_basis) audio = griffin_lim(
spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0) torch.autograd.Variable(spec_from_mel[:, :, :-1]),
spec_from_mel = spec_from_mel * spec_from_mel_scaling self.taco_stft.stft_fn,
audio = griffin_lim( GL_ITERS,
torch.autograd.Variable(spec_from_mel[:, :, :-1]), )
self.taco_stft.stft_fn, audio = audio.squeeze()
griffin_iters, else:
) raise ValueError("vocoder arg should be one of [wavglow|gl]")
audio = audio.squeeze()
audio = audio.cpu().numpy() audio = audio.cpu().numpy()
return audio
def _synth_speech(
self, text, speed: float = 1.0, sample_rate: int = OUTPUT_SAMPLE_RATE
):
audio = self.synth_speech_array(text, VOCODER_WAVEGLOW)
return postprocess_audio( return postprocess_audio(
audio, src_rate=TTS_SAMPLE_RATE, dst_rate=OUTPUT_SAMPLE_RATE audio,
src_rate=self.hparams.sampling_rate,
dst_rate=sample_rate,
tempo=speed,
)
def _synth_speech_fast(
self, text, speed: float = 1.0, sample_rate: int = OUTPUT_SAMPLE_RATE
):
audio = self.synth_speech_array(text, VOCODER_GL)
return postprocess_audio(
audio,
tempo=speed,
src_rate=self.hparams.sampling_rate,
dst_rate=sample_rate,
) )
def player_gen(): def player_gen():
try:
import pyaudio
except ModuleNotFoundError:
warnings.warn("module 'pyaudio' is not installed requried for playback")
return
audio_interface = pyaudio.PyAudio() audio_interface = pyaudio.PyAudio()
_audio_stream = audio_interface.open( _audio_stream = audio_interface.open(
format=pyaudio.paInt16, channels=1, rate=OUTPUT_SAMPLE_RATE, output=True format=pyaudio.paInt16, channels=1, rate=OUTPUT_SAMPLE_RATE, output=True
@@ -133,10 +173,7 @@ def player_gen():
return play_device return play_device
def repl(): def repl(tts_model):
tts_model = TTSModel(
"/Users/malar/Work/tacotron2_r4_83000.pt", "/Users/malar/Work/waveglow_484000"
)
player = player_gen() player = player_gen()
def loop(): def loop():
@@ -148,7 +185,26 @@ def repl():
def main(): def main():
interactive_loop = repl() parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"-t",
"--tacotron2_path",
type=Path,
default="./tacotron.pt",
help="Path to a tacotron2 model",
)
parser.add_argument(
"-w",
"--waveglow_path",
type=Path,
default="./waveglow_256channels.pt",
help="Path to a waveglow model",
)
args = parser.parse_args()
tts_model = TTSModel(**vars(args))
interactive_loop = repl(tts_model)
while True: while True:
interactive_loop() interactive_loop()

View File

@@ -27,6 +27,6 @@ def load_filepaths_and_text(filename, split="|"):
def to_gpu(x): def to_gpu(x):
x = x.contiguous() x = x.contiguous()
# if torch.cuda.is_available(): #initially not commented out if torch.cuda.is_available(): #initially not commented out
# x = x.cuda(non_blocking=True) # initially not commented out x = x.cuda(non_blocking=True) # initially not commented out
return torch.autograd.Variable(x) return torch.autograd.Variable(x)