cleanup unused code and fix packaging issues

master
Malar Kannan 2019-07-03 17:26:12 +05:30
parent 5f75aa0a0d
commit 108ce2b079
3 changed files with 204 additions and 0 deletions

0
__init__.py Normal file
View File

29
demo_client.py Normal file
View File

@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
import grpc
from sia.proto import tts_pb2
from sia.proto import tts_pb2_grpc
from .tts import player_gen
def tts_player():
player = player_gen()
channel = grpc.insecure_channel('localhost:50060')
stub = tts_pb2_grpc.ServerStub(channel)
def play(t):
test_text = tts_pb2.TextInput(text=t)
speech = stub.TextToSpeechAPI(test_text)
player(speech.response)
return play
def main():
play = tts_player()
play('How may I help you today?')
import pdb
pdb.set_trace()
if __name__ == '__main__':
main()

175
tts.py Normal file
View File

@ -0,0 +1,175 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import numpy as np
import torch
from .hparams import create_hparams
from .text import text_to_sequence
from .glow import WaveGlow
# import os
# import soundfile as sf
import pyaudio
import klepto
from librosa import resample
from librosa.effects import time_stretch
from sia.file_utils import cached_model_path
from sia.instruments import do_time
from .model import Tacotron2
TTS_SAMPLE_RATE = 22050
OUTPUT_SAMPLE_RATE = 16000
# https://github.com/NVIDIA/waveglow/blob/master/config.json
WAVEGLOW_CONFIG = {
"n_mel_channels": 80,
"n_flows": 12,
"n_group": 8,
"n_early_every": 4,
"n_early_size": 2,
"WN_config": {
"n_layers": 8,
"n_channels": 256,
"kernel_size": 3
}
}
class TTSModel(object):
"""docstring for TTSModel."""
def __init__(self):
super(TTSModel, self).__init__()
hparams = create_hparams()
hparams.sampling_rate = TTS_SAMPLE_RATE
self.model = Tacotron2(hparams)
tacotron2_path = cached_model_path("tacotron2_model")
self.model.load_state_dict(
torch.load(tacotron2_path, map_location='cpu')['state_dict'])
self.model.eval()
waveglow_path = cached_model_path('waveglow_model')
self.waveglow = WaveGlow(**WAVEGLOW_CONFIG)
wave_params = torch.load(waveglow_path, map_location='cpu')
self.waveglow.load_state_dict(wave_params)
self.waveglow.eval()
for k in self.waveglow.convinv:
k.float()
self.k_cache = klepto.archives.file_archive(cached=False)
self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
self.synth_speech)
# https://github.com/NVIDIA/waveglow/issues/127
for m in self.waveglow.modules():
if 'Conv' in str(type(m)):
setattr(m, 'padding_mode', 'zeros')
@do_time
def synth_speech(self, t):
text = t
sequence = np.array(text_to_sequence(text,
['english_cleaners']))[None, :]
sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long()
mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(
sequence)
with torch.no_grad():
audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)
audio = audio_t[0].data.cpu().numpy()
# data = convert(audio)
slow_data = time_stretch(audio, 0.8)
float_data = resample(slow_data, TTS_SAMPLE_RATE, OUTPUT_SAMPLE_RATE)
data = float2pcm(float_data)
return data.tobytes()
# def convert(array):
# sf.write('sample.wav', array, TTS_SAMPLE_RATE)
# # convert to $OUTPUT_SAMPLE_RATE
# os.system('ffmpeg -i {0} -filter:a "atempo=0.80" -ar 16k {1}'.format(
# 'sample.wav', 'sample0.wav'))
# data, rate = sf.read('sample0.wav', dtype='int16')
# os.remove('sample.wav')
# os.remove('sample0.wav')
# return data
# https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
def float2pcm(sig, dtype='int16'):
"""Convert floating point signal with a range from -1 to 1 to PCM.
Any signal values outside the interval [-1.0, 1.0) are clipped.
No dithering is used.
Note that there are different possibilities for scaling floating
point numbers to PCM numbers, this function implements just one of
them. For an overview of alternatives see
http://blog.bjornroche.com/2009/12/int-float-int-its-jungle-out-there.html
Parameters
----------
sig : array_like
Input array, must have floating point type.
dtype : data type, optional
Desired (integer) data type.
Returns
-------
numpy.ndarray
Integer data, scaled and clipped to the range of the given
*dtype*.
See Also
--------
pcm2float, dtype
"""
sig = np.asarray(sig)
if sig.dtype.kind != 'f':
raise TypeError("'sig' must be a float array")
dtype = np.dtype(dtype)
if dtype.kind not in 'iu':
raise TypeError("'dtype' must be an integer type")
i = np.iinfo(dtype)
abs_max = 2**(i.bits - 1)
offset = i.min + abs_max
return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)
def display(data):
import IPython.display as ipd
aud = ipd.Audio(data, rate=16000)
return aud
def player_gen():
audio_interface = pyaudio.PyAudio()
_audio_stream = audio_interface.open(format=pyaudio.paInt16,
channels=1,
rate=OUTPUT_SAMPLE_RATE,
output=True)
def play_device(data):
_audio_stream.write(data)
# _audio_stream.close()
return play_device
def synthesize_corpus():
tts_model = TTSModel()
all_data = []
for (i, line) in enumerate(open('corpus.txt').readlines()):
print('synthesizing... "{}"'.format(line.strip()))
data = tts_model.synth_speech(line.strip())
all_data.append(data)
return all_data
def play_corpus(corpus_synths):
player = player_gen()
for d in corpus_synths:
player(d)
def main():
corpus_synth_data = synthesize_corpus()
play_corpus(corpus_synth_data)
import ipdb
ipdb.set_trace()
if __name__ == '__main__':
main()