From 108ce2b079c8c976d902129ad0c9fd6f303457bf Mon Sep 17 00:00:00 2001
From: Malar Kannan
Date: Wed, 3 Jul 2019 17:26:12 +0530
Subject: [PATCH] Clean up unused code and fix packaging issues

---
 __init__.py    |   0
 demo_client.py |  29 ++++
 tts.py         | 175 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 204 insertions(+)
 create mode 100644 __init__.py
 create mode 100644 demo_client.py
 create mode 100644 tts.py

diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/demo_client.py b/demo_client.py
new file mode 100644
index 0000000..b132277
--- /dev/null
+++ b/demo_client.py
@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+import grpc
+from sia.proto import tts_pb2
+from sia.proto import tts_pb2_grpc
+from .tts import player_gen
+
+
+def tts_player():
+    player = player_gen()
+    channel = grpc.insecure_channel('localhost:50060')
+    stub = tts_pb2_grpc.ServerStub(channel)
+
+    def play(t):
+        test_text = tts_pb2.TextInput(text=t)
+        speech = stub.TextToSpeechAPI(test_text)
+        player(speech.response)
+
+    return play
+
+
+def main():
+    play = tts_player()
+    play('How may I help you today?')
+    import pdb
+    pdb.set_trace()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tts.py b/tts.py
new file mode 100644
index 0000000..23bf8b2
--- /dev/null
+++ b/tts.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import torch
+from .hparams import create_hparams
+from .text import text_to_sequence
+from .glow import WaveGlow
+# import os
+# import soundfile as sf
+import pyaudio
+import klepto
+from librosa import resample
+from librosa.effects import time_stretch
+from sia.file_utils import cached_model_path
+from sia.instruments import do_time
+from .model import Tacotron2
+
+TTS_SAMPLE_RATE = 22050
+OUTPUT_SAMPLE_RATE = 16000
+
+# https://github.com/NVIDIA/waveglow/blob/master/config.json
+WAVEGLOW_CONFIG = {
+    "n_mel_channels": 80,
+    "n_flows": 12,
+    "n_group": 8,
+    "n_early_every": 4,
+    "n_early_size": 2,
+    "WN_config": {
+        "n_layers": 8,
+        "n_channels": 256,
+        "kernel_size": 3
+    }
+}
+
+
+class TTSModel(object):
+    """docstring for TTSModel."""
+
+    def __init__(self):
+        super(TTSModel, self).__init__()
+        hparams = create_hparams()
+        hparams.sampling_rate = TTS_SAMPLE_RATE
+        self.model = Tacotron2(hparams)
+        tacotron2_path = cached_model_path("tacotron2_model")
+        self.model.load_state_dict(
+            torch.load(tacotron2_path, map_location='cpu')['state_dict'])
+        self.model.eval()
+        waveglow_path = cached_model_path('waveglow_model')
+        self.waveglow = WaveGlow(**WAVEGLOW_CONFIG)
+        wave_params = torch.load(waveglow_path, map_location='cpu')
+        self.waveglow.load_state_dict(wave_params)
+        self.waveglow.eval()
+        for k in self.waveglow.convinv:
+            k.float()
+        self.k_cache = klepto.archives.file_archive(cached=False)
+        self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
+            self.synth_speech)
+
+        # https://github.com/NVIDIA/waveglow/issues/127
+        for m in self.waveglow.modules():
+            if 'Conv' in str(type(m)):
+                setattr(m, 'padding_mode', 'zeros')
+
+    @do_time
+    def synth_speech(self, t):
+        text = t
+        sequence = np.array(text_to_sequence(text,
+                                             ['english_cleaners']))[None, :]
+        sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long()
+        mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(
+            sequence)
+        with torch.no_grad():
+            audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)
+        audio = audio_t[0].data.cpu().numpy()
+        # data = convert(audio)
+        slow_data = time_stretch(audio, 0.8)
+        float_data = resample(slow_data, TTS_SAMPLE_RATE, OUTPUT_SAMPLE_RATE)
+        data = float2pcm(float_data)
+        return data.tobytes()
+
+
+# def convert(array):
+#     sf.write('sample.wav', array, TTS_SAMPLE_RATE)
+#     # convert to $OUTPUT_SAMPLE_RATE
+#     os.system('ffmpeg -i {0} -filter:a "atempo=0.80" -ar 16k {1}'.format(
+#         'sample.wav', 'sample0.wav'))
+#     data, rate = sf.read('sample0.wav', dtype='int16')
+#     os.remove('sample.wav')
+#     os.remove('sample0.wav')
+#     return data
+
+
+# https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
+def float2pcm(sig, dtype='int16'):
+    """Convert floating point signal with a range from -1 to 1 to PCM.
+    Any signal values outside the interval [-1.0, 1.0) are clipped.
+    No dithering is used.
+    Note that there are different possibilities for scaling floating
+    point numbers to PCM numbers, this function implements just one of
+    them. For an overview of alternatives see
+    http://blog.bjornroche.com/2009/12/int-float-int-its-jungle-out-there.html
+    Parameters
+    ----------
+    sig : array_like
+        Input array, must have floating point type.
+    dtype : data type, optional
+        Desired (integer) data type.
+    Returns
+    -------
+    numpy.ndarray
+        Integer data, scaled and clipped to the range of the given
+        *dtype*.
+    See Also
+    --------
+    pcm2float, dtype
+    """
+    sig = np.asarray(sig)
+    if sig.dtype.kind != 'f':
+        raise TypeError("'sig' must be a float array")
+    dtype = np.dtype(dtype)
+    if dtype.kind not in 'iu':
+        raise TypeError("'dtype' must be an integer type")
+
+    i = np.iinfo(dtype)
+    abs_max = 2**(i.bits - 1)
+    offset = i.min + abs_max
+    return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)
+
+
+def display(data):
+    import IPython.display as ipd
+    aud = ipd.Audio(data, rate=16000)
+    return aud
+
+
+def player_gen():
+    audio_interface = pyaudio.PyAudio()
+    _audio_stream = audio_interface.open(format=pyaudio.paInt16,
+                                         channels=1,
+                                         rate=OUTPUT_SAMPLE_RATE,
+                                         output=True)
+
+    def play_device(data):
+        _audio_stream.write(data)
+        # _audio_stream.close()
+
+    return play_device
+
+
+def synthesize_corpus():
+    tts_model = TTSModel()
+    all_data = []
+    for (i, line) in enumerate(open('corpus.txt').readlines()):
+        print('synthesizing... "{}"'.format(line.strip()))
+        data = tts_model.synth_speech(line.strip())
+        all_data.append(data)
+    return all_data
+
+
+def play_corpus(corpus_synths):
+    player = player_gen()
+    for d in corpus_synths:
+        player(d)
+
+
+def main():
+    corpus_synth_data = synthesize_corpus()
+    play_corpus(corpus_synth_data)
+    import ipdb
+    ipdb.set_trace()
+
+
+if __name__ == '__main__':
+    main()
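
The float2pcm conversion in tts.py maps the half-open interval [-1.0, 1.0) onto the full range of the target integer type and clips anything outside it. A standalone check of that arithmetic (the expression is copied out of float2pcm for illustration; only numpy is needed):

    import numpy as np

    # Same scaling float2pcm applies for 'int16': scale by 2**15, clip, cast.
    sig = np.array([-1.5, -1.0, 0.0, 0.5, 1.0], dtype=np.float32)
    info = np.iinfo(np.int16)
    abs_max = 2 ** (info.bits - 1)  # 32768
    offset = info.min + abs_max     # 0 for signed dtypes
    pcm = (sig * abs_max + offset).clip(info.min, info.max).astype(np.int16)
    print(pcm)  # [-32768 -32768      0  16384  32767]; note +1.0 clips to 32767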
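
demo_client.py talks to a gRPC service on port 50060 whose generated modules (tts_pb2, tts_pb2_grpc) are not part of this patch. Below is a minimal sketch of a server that would satisfy that client, assuming the proto declares a service named Server (which is what the ServerStub/ServerServicer names imply under gRPC's Python codegen). The reply message name SpeechResponse is a guess: only its `response` bytes field is visible from the client code.

    import time
    from concurrent import futures

    import grpc
    from sia.proto import tts_pb2
    from sia.proto import tts_pb2_grpc

    from .tts import TTSModel


    class TTSServicer(tts_pb2_grpc.ServerServicer):
        def __init__(self):
            self.tts = TTSModel()

        def TextToSpeechAPI(self, request, context):
            # synth_speech returns 16 kHz mono 16-bit PCM bytes
            pcm = self.tts.synth_speech(request.text)
            return tts_pb2.SpeechResponse(response=pcm)  # assumed message name


    def serve():
        server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
        tts_pb2_grpc.add_ServerServicer_to_server(TTSServicer(), server)
        server.add_insecure_port('[::]:50060')  # port used by demo_client.py
        server.start()
        try:
            while True:
                time.sleep(3600)
        except KeyboardInterrupt:
            server.stop(0)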