mirror of https://github.com/malarinv/tacotron2
clean up unused code and fix packaging issues
parent 5f75aa0a0d
commit 108ce2b079
@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
import grpc

from sia.proto import tts_pb2
from sia.proto import tts_pb2_grpc

from .tts import player_gen


def tts_player():
    """Connect to the gRPC TTS server and return a play(text) function."""
    player = player_gen()
    channel = grpc.insecure_channel('localhost:50060')
    stub = tts_pb2_grpc.ServerStub(channel)

    def play(t):
        # Send the text to the server and play the synthesized audio bytes.
        text_input = tts_pb2.TextInput(text=t)
        speech = stub.TextToSpeechAPI(text_input)
        player(speech.response)

    return play


def main():
    play = tts_player()
    play('How may I help you today?')
    # Drop into the debugger so the process (and audio stream) stays alive
    # for interactive experimentation.
    import pdb
    pdb.set_trace()


if __name__ == '__main__':
    main()
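
For orientation, a minimal matching server might look like the sketch below. This is an assumption-laden illustration, not code from this commit: the servicer base class name (ServerServicer), the reply message type (called AudioOutput here), and its bytes `response` field are all inferred from the client calls above.

from concurrent import futures

import grpc

from sia.proto import tts_pb2, tts_pb2_grpc
from .tts import TTSModel  # the module defined below


class TTSServicer(tts_pb2_grpc.ServerServicer):
    def __init__(self):
        # Load Tacotron2 + WaveGlow once, at startup.
        self.tts_model = TTSModel()

    def TextToSpeechAPI(self, request, context):
        # Synthesize the request text to 16-bit PCM bytes.
        pcm = self.tts_model.synth_speech(request.text)
        return tts_pb2.AudioOutput(response=pcm)  # AudioOutput: assumed name


def serve():
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
    tts_pb2_grpc.add_ServerServicer_to_server(TTSServicer(), server)
    server.add_insecure_port('[::]:50060')
    server.start()
    server.wait_for_termination()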

@@ -0,0 +1,175 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import torch
import pyaudio
import klepto
from librosa import resample
from librosa.effects import time_stretch
# import os            # used by the commented-out ffmpeg convert() below
# import soundfile as sf

from sia.file_utils import cached_model_path
from sia.instruments import do_time

from .hparams import create_hparams
from .text import text_to_sequence
from .glow import WaveGlow
from .model import Tacotron2

TTS_SAMPLE_RATE = 22050     # native Tacotron2/WaveGlow sample rate
OUTPUT_SAMPLE_RATE = 16000  # playback sample rate

# https://github.com/NVIDIA/waveglow/blob/master/config.json
WAVEGLOW_CONFIG = {
    "n_mel_channels": 80,
    "n_flows": 12,
    "n_group": 8,
    "n_early_every": 4,
    "n_early_size": 2,
    "WN_config": {
        "n_layers": 8,
        "n_channels": 256,
        "kernel_size": 3
    }
}


class TTSModel(object):
    """Tacotron2 (text -> mel) + WaveGlow (mel -> audio) pipeline on CPU."""

    def __init__(self):
        super(TTSModel, self).__init__()
        hparams = create_hparams()
        hparams.sampling_rate = TTS_SAMPLE_RATE

        # Acoustic model: text sequence -> mel spectrogram.
        self.model = Tacotron2(hparams)
        tacotron2_path = cached_model_path("tacotron2_model")
        self.model.load_state_dict(
            torch.load(tacotron2_path, map_location='cpu')['state_dict'])
        self.model.eval()

        # Vocoder: mel spectrogram -> waveform.
        waveglow_path = cached_model_path('waveglow_model')
        self.waveglow = WaveGlow(**WAVEGLOW_CONFIG)
        wave_params = torch.load(waveglow_path, map_location='cpu')
        self.waveglow.load_state_dict(wave_params)
        self.waveglow.eval()
        for k in self.waveglow.convinv:
            k.float()

        # Memoize synth_speech so repeated texts are synthesized only once.
        self.k_cache = klepto.archives.file_archive(cached=False)
        self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
            self.synth_speech)

        # Workaround for checkpoints saved before Conv gained padding_mode:
        # https://github.com/NVIDIA/waveglow/issues/127
        for m in self.waveglow.modules():
            if 'Conv' in str(type(m)):
                setattr(m, 'padding_mode', 'zeros')

    @do_time
    def synth_speech(self, t):
        # Text -> sequence of symbol ids, shape (1, T).
        sequence = np.array(text_to_sequence(t,
                                             ['english_cleaners']))[None, :]
        sequence = torch.from_numpy(sequence).long()
        with torch.no_grad():
            mel_outputs, mel_outputs_postnet, _, alignments = \
                self.model.inference(sequence)
            # sigma is the WaveGlow sampling temperature.
            audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)
        audio = audio_t[0].data.cpu().numpy()
        # data = convert(audio)  # alternate ffmpeg path, kept below
        # Slow down to 0.8x, resample 22050 Hz -> 16000 Hz, emit 16-bit PCM.
        slow_data = time_stretch(audio, 0.8)
        float_data = resample(slow_data, TTS_SAMPLE_RATE, OUTPUT_SAMPLE_RATE)
        data = float2pcm(float_data)
        return data.tobytes()
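
# Minimal end-to-end usage sketch (an illustration, not part of the module's
# flow; it assumes both checkpoints resolve via cached_model_path as above):
#
#     tts = TTSModel()
#     pcm = tts.synth_speech('How may I help you today?')  # 16 kHz PCM bytes
#     player_gen()(pcm)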


# Alternate ffmpeg-based slowdown/resample path, kept for reference:
# def convert(array):
#     sf.write('sample.wav', array, TTS_SAMPLE_RATE)
#     # convert to $OUTPUT_SAMPLE_RATE
#     os.system('ffmpeg -i {0} -filter:a "atempo=0.80" -ar 16k {1}'.format(
#         'sample.wav', 'sample0.wav'))
#     data, rate = sf.read('sample0.wav', dtype='int16')
#     os.remove('sample.wav')
#     os.remove('sample0.wav')
#     return data


# https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
def float2pcm(sig, dtype='int16'):
    """Convert floating point signal with a range from -1 to 1 to PCM.

    Any signal values outside the interval [-1.0, 1.0) are clipped.
    No dithering is used.
    Note that there are different possibilities for scaling floating
    point numbers to PCM numbers, this function implements just one of
    them. For an overview of alternatives see
    http://blog.bjornroche.com/2009/12/int-float-int-its-jungle-out-there.html

    Parameters
    ----------
    sig : array_like
        Input array, must have floating point type.
    dtype : data type, optional
        Desired (integer) data type.

    Returns
    -------
    numpy.ndarray
        Integer data, scaled and clipped to the range of the given
        *dtype*.

    See Also
    --------
    pcm2float, dtype

    """
    sig = np.asarray(sig)
    if sig.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    dtype = np.dtype(dtype)
    if dtype.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")

    i = np.iinfo(dtype)
    abs_max = 2**(i.bits - 1)
    offset = i.min + abs_max
    return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)
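
# Worked example of the scaling above: for dtype='int16', abs_max = 2**15 =
# 32768 and offset = -32768 + 32768 = 0, so
#     float2pcm(np.array([-1.0, 0.0, 0.5, 1.0]))
# gives array([-32768, 0, 16384, 32767], dtype=int16); 1.0 clips to 32767
# because +32768 is not representable in int16.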


def display(data):
    """Wrap PCM bytes in an IPython audio widget (for notebook use)."""
    import IPython.display as ipd
    aud = ipd.Audio(data, rate=OUTPUT_SAMPLE_RATE)
    return aud


def player_gen():
    """Open a mono 16-bit output stream and return a play(data) function."""
    audio_interface = pyaudio.PyAudio()
    _audio_stream = audio_interface.open(format=pyaudio.paInt16,
                                         channels=1,
                                         rate=OUTPUT_SAMPLE_RATE,
                                         output=True)

    def play_device(data):
        # The stream stays open so play_device can be called repeatedly.
        _audio_stream.write(data)

    return play_device
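
# Sketch of an explicit shutdown using PyAudio's standard calls (as written,
# player_gen would need to expose the stream and interface for a caller to
# invoke these):
#
#     _audio_stream.stop_stream()
#     _audio_stream.close()
#     audio_interface.terminate()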


def synthesize_corpus():
    """Synthesize every line of corpus.txt; return the list of PCM bytes."""
    tts_model = TTSModel()
    all_data = []
    with open('corpus.txt') as corpus:
        for line in corpus:
            print('synthesizing... "{}"'.format(line.strip()))
            data = tts_model.synth_speech(line.strip())
            all_data.append(data)
    return all_data


def play_corpus(corpus_synths):
    player = player_gen()
    for d in corpus_synths:
        player(d)


def main():
    corpus_synth_data = synthesize_corpus()
    play_corpus(corpus_synth_data)
    # Drop into the debugger to inspect results interactively.
    import ipdb
    ipdb.set_trace()


if __name__ == '__main__':
    main()