tacotron2/tts.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import torch
from .hparams import create_hparams
from .text import text_to_sequence
from .glow import WaveGlow

# import os
# import soundfile as sf
import pyaudio
import klepto
from librosa import resample
from librosa.effects import time_stretch
from sia.file_utils import cached_model_path
from sia.instruments import do_time
from .model import Tacotron2

# Sample rate (Hz) assigned to the Tacotron2 hparams; also the source rate
# when the synthesized audio is resampled.
TTS_SAMPLE_RATE = 22050
# Sample rate (Hz) the audio is resampled to and the playback stream uses.
OUTPUT_SAMPLE_RATE = 16000

# Keyword arguments used to construct WaveGlow; values taken from
# https://github.com/NVIDIA/waveglow/blob/master/config.json
WAVEGLOW_CONFIG = {
    "n_mel_channels": 80,
    "n_flows": 12,
    "n_group": 8,
    "n_early_every": 4,
    "n_early_size": 2,
    "WN_config": {
        "n_layers": 8,
        "n_channels": 256,
        "kernel_size": 3,
    },
}


class TTSModel(object):
    """Text-to-speech pipeline: Tacotron2 (text -> mel) plus WaveGlow vocoder.

    Both networks are built on construction, their weights are loaded onto
    the CPU from the checkpoints located by ``cached_model_path`` and they
    are switched to eval mode.  ``synth_speech`` is wrapped per instance in
    a klepto ``inf_cache`` backed by a klepto file archive, so the results
    are memoized on the text argument.
    """

    def __init__(self):
        super(TTSModel, self).__init__()
        hparams = create_hparams()
        hparams.sampling_rate = TTS_SAMPLE_RATE
        self.model = Tacotron2(hparams)
        tacotron2_path = cached_model_path("tacotron2_model")
        self.model.load_state_dict(
            torch.load(tacotron2_path, map_location="cpu")["state_dict"]
        )
        self.model.eval()

        waveglow_path = cached_model_path("waveglow_model")
        self.waveglow = WaveGlow(**WAVEGLOW_CONFIG)
        wave_params = torch.load(waveglow_path, map_location="cpu")
        self.waveglow.load_state_dict(wave_params)
        self.waveglow.eval()
        # Cast the `convinv` sub-modules to float32.
        for k in self.waveglow.convinv:
            k.float()

        # Memoize synthesis on this instance: the bound method is replaced
        # by a cached wrapper that stores its results in `k_cache`.
        # NOTE(review): the archive is created with klepto's default file
        # name -- confirm where it lands on disk.
        self.k_cache = klepto.archives.file_archive(cached=False)
        self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
            self.synth_speech
        )

        # workaround from
        # https://github.com/NVIDIA/waveglow/issues/127
        # Every module whose type name contains "Conv" gets an explicit
        # `padding_mode` attribute.
        for m in self.waveglow.modules():
            if "Conv" in str(type(m)):
                m.padding_mode = "zeros"

    @do_time
    def synth_speech(self, t):
        """Synthesize speech for the text ``t``.

        The text is converted to a symbol sequence with the
        ``english_cleaners`` pipeline, turned into a mel spectrogram by
        Tacotron2 and vocoded by WaveGlow.  The waveform is then
        time-stretched with rate 0.8, resampled from ``TTS_SAMPLE_RATE`` to
        ``OUTPUT_SAMPLE_RATE`` and converted to 16-bit PCM.

        Parameters
        ----------
        t : str
            Text to synthesize.

        Returns
        -------
        bytes
            Raw int16 PCM samples at ``OUTPUT_SAMPLE_RATE``.
        """
        # Add a leading batch axis of size one.
        sequence = np.array(text_to_sequence(t, ["english_cleaners"]))[
            None, :
        ]
        sequence = torch.from_numpy(sequence).long()
        # Pure inference: keep both networks out of autograd so no graph is
        # built for the Tacotron2 pass either.
        with torch.no_grad():
            _, mel_outputs_postnet, _, _ = self.model.inference(sequence)
            audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)
        audio = audio_t[0].detach().cpu().numpy()
        # Keyword arguments work with both old and new librosa releases;
        # newer releases no longer accept these positionally.
        slow_data = time_stretch(audio, rate=0.8)
        float_data = resample(
            slow_data, orig_sr=TTS_SAMPLE_RATE, target_sr=OUTPUT_SAMPLE_RATE
        )
        data = float2pcm(float_data)
        return data.tobytes()


# adapted from
# https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
def float2pcm(sig, dtype="int16"):
    """Scale a floating point signal in [-1.0, 1.0) to integer PCM.

    Samples are multiplied by half the integer range of *dtype*, shifted so
    that 0.0 maps to the integer mid-point, clipped to the representable
    range and cast.  Values outside [-1.0, 1.0) therefore saturate.  No
    dithering is applied.

    This is only one of several possible float-to-PCM scalings; see
    http://blog.bjornroche.com/2009/12/int-float-int-its-jungle-out-there.html
    for an overview of the alternatives.

    Parameters
    ----------
    sig : array_like
        Input samples; must have a floating point dtype.
    dtype : data type, optional
        Target integer dtype (signed or unsigned).  Defaults to ``int16``.

    Returns
    -------
    numpy.ndarray
        Samples scaled and clipped to the range of *dtype*.

    Raises
    ------
    TypeError
        If *sig* is not floating point or *dtype* is not an integer type.
    """
    samples = np.asarray(sig)
    if samples.dtype.kind != "f":
        raise TypeError("'sig' must be a float array")
    target = np.dtype(dtype)
    if target.kind not in ("i", "u"):
        raise TypeError("'dtype' must be an integer type")

    limits = np.iinfo(target)
    half_range = 2 ** (limits.bits - 1)
    # Zero for signed types, half_range for unsigned ones.
    midpoint = limits.min + half_range
    scaled = samples * half_range + midpoint
    return np.clip(scaled, limits.min, limits.max).astype(target)


def display(data, rate=None):
    """Wrap ``data`` in an ``IPython.display.Audio`` widget.

    Parameters
    ----------
    data
        Audio payload handed to ``IPython.display.Audio`` unchanged.
        NOTE(review): confirm that the raw PCM bytes produced by the
        synthesizer are an accepted payload here.
    rate : int, optional
        Sample rate passed to the widget.  Defaults to
        ``OUTPUT_SAMPLE_RATE`` so it cannot drift from the rate the
        synthesizer resamples to.

    Returns
    -------
    IPython.display.Audio
        The audio widget.
    """
    # Imported lazily so IPython is only needed when this helper is used.
    import IPython.display as ipd

    if rate is None:
        rate = OUTPUT_SAMPLE_RATE
    return ipd.Audio(data, rate=rate)


def player_gen():
    """Open a PyAudio output stream and return a function that plays to it.

    The stream is mono, 16-bit, at ``OUTPUT_SAMPLE_RATE``.

    Returns
    -------
    callable
        ``play_device(data)`` writes ``data`` to the stream.  The function
        also carries a ``close`` attribute: calling ``play_device.close()``
        stops and closes the stream and terminates the PyAudio instance.
        Calling it more than once is harmless.
    """
    audio_interface = pyaudio.PyAudio()
    try:
        _audio_stream = audio_interface.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=OUTPUT_SAMPLE_RATE,
            output=True,
        )
    except Exception:
        # Do not leak the PyAudio instance when the device cannot be opened.
        audio_interface.terminate()
        raise

    # Mutable flag shared with the closures below.
    state = {"closed": False}

    def play_device(data):
        _audio_stream.write(data)

    def close_device():
        if state["closed"]:
            return
        state["closed"] = True
        try:
            _audio_stream.stop_stream()
            _audio_stream.close()
        finally:
            audio_interface.terminate()

    # Exposed as an attribute so existing callers that only invoke the
    # returned function keep working unchanged.
    play_device.close = close_device
    return play_device


def synthesize_corpus(corpus_path="corpus.txt"):
    """Synthesize every non-blank line of a text corpus.

    Parameters
    ----------
    corpus_path : str, optional
        Path of the corpus file, one utterance per line.  Defaults to
        ``"corpus.txt"`` in the current working directory.

    Returns
    -------
    list
        One ``TTSModel.synth_speech`` result per non-blank line, in file
        order.
    """
    tts_model = TTSModel()
    all_data = []
    # The context manager closes the file even if synthesis raises.
    with open(corpus_path) as corpus_file:
        for line in corpus_file:
            text = line.strip()
            if not text:
                # Nothing to synthesize for a blank line.
                continue
            print('synthesizing... "{}"'.format(text))
            all_data.append(tts_model.synth_speech(text))
    return all_data


def play_corpus(corpus_synths):
    """Play each synthesized clip in ``corpus_synths`` in order.

    A playback function is obtained from ``player_gen`` before iteration
    starts and is called once per clip.
    """
    play = player_gen()
    for clip in corpus_synths:
        play(clip)


def main():
    """Synthesize the corpus and play the resulting clips back."""
    corpus_synth_data = synthesize_corpus()
    play_corpus(corpus_synth_data)


# Run the corpus synthesis and playback only when executed as a script.
if __name__ == "__main__":
    main()
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`#!/usr/bin/env python`
			`# -- coding: utf-8 --`

			`import numpy as np`
			`import torch`
			`from .hparams import create_hparams`
			`from .text import text_to_sequence`
			`from .glow import WaveGlow`
move codestyle to black 2019-07-03 12:40:16 +00:00
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`# import os`
			`# import soundfile as sf`
			`import pyaudio`
			`import klepto`
			`from librosa import resample`
			`from librosa.effects import time_stretch`
			`from sia.file_utils import cached_model_path`
			`from sia.instruments import do_time`
			`from .model import Tacotron2`

			`TTS_SAMPLE_RATE = 22050`
			`OUTPUT_SAMPLE_RATE = 16000`

update comments 2019-07-05 09:03:04 +00:00			`# config from`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`# https://github.com/NVIDIA/waveglow/blob/master/config.json`
			`WAVEGLOW_CONFIG = {`
			`"n_mel_channels": 80,`
			`"n_flows": 12,`
			`"n_group": 8,`
			`"n_early_every": 4,`
			`"n_early_size": 2,`
move codestyle to black 2019-07-03 12:40:16 +00:00			`"WN_config": {"n_layers": 8, "n_channels": 256, "kernel_size": 3},`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`}`


			`class TTSModel(object):`
			`"""docstring for TTSModel."""`

			`def __init__(self):`
			`super(TTSModel, self).__init__()`
			`hparams = create_hparams()`
			`hparams.sampling_rate = TTS_SAMPLE_RATE`
			`self.model = Tacotron2(hparams)`
			`tacotron2_path = cached_model_path("tacotron2_model")`
			`self.model.load_state_dict(`
move codestyle to black 2019-07-03 12:40:16 +00:00			`torch.load(tacotron2_path, map_location="cpu")["state_dict"]`
			`)`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`self.model.eval()`
move codestyle to black 2019-07-03 12:40:16 +00:00			`waveglow_path = cached_model_path("waveglow_model")`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`self.waveglow = WaveGlow(**WAVEGLOW_CONFIG)`
move codestyle to black 2019-07-03 12:40:16 +00:00			`wave_params = torch.load(waveglow_path, map_location="cpu")`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`self.waveglow.load_state_dict(wave_params)`
			`self.waveglow.eval()`
			`for k in self.waveglow.convinv:`
			`k.float()`
			`self.k_cache = klepto.archives.file_archive(cached=False)`
			`self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(`
move codestyle to black 2019-07-03 12:40:16 +00:00			`self.synth_speech`
			`)`
update comments 2019-07-05 09:03:04 +00:00			`# workaround from`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`# https://github.com/NVIDIA/waveglow/issues/127`
			`for m in self.waveglow.modules():`
move codestyle to black 2019-07-03 12:40:16 +00:00			`if "Conv" in str(type(m)):`
			`setattr(m, "padding_mode", "zeros")`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00
			`@do_time`
			`def synth_speech(self, t):`
			`text = t`
move codestyle to black 2019-07-03 12:40:16 +00:00			`sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[`
			`None, :`
			`]`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long()`
			`mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(`
move codestyle to black 2019-07-03 12:40:16 +00:00			`sequence`
			`)`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`with torch.no_grad():`
			`audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)`
			`audio = audio_t[0].data.cpu().numpy()`
			`# data = convert(audio)`
			`slow_data = time_stretch(audio, 0.8)`
			`float_data = resample(slow_data, TTS_SAMPLE_RATE, OUTPUT_SAMPLE_RATE)`
			`data = float2pcm(float_data)`
			`return data.tobytes()`


update comments 2019-07-05 09:03:04 +00:00			`# adapted from`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`# https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py`
move codestyle to black 2019-07-03 12:40:16 +00:00			`def float2pcm(sig, dtype="int16"):`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`"""Convert floating point signal with a range from -1 to 1 to PCM.`
			`Any signal values outside the interval [-1.0, 1.0) are clipped.`
			`No dithering is used.`
			`Note that there are different possibilities for scaling floating`
			`point numbers to PCM numbers, this function implements just one of`
			`them. For an overview of alternatives see`
			`http://blog.bjornroche.com/2009/12/int-float-int-its-jungle-out-there.html`
			`Parameters`
			`----------`
			`sig : array_like`
			`Input array, must have floating point type.`
			`dtype : data type, optional`
			`Desired (integer) data type.`
			`Returns`
			`-------`
			`numpy.ndarray`
			`Integer data, scaled and clipped to the range of the given`
			`dtype.`
			`See Also`
			`--------`
			`pcm2float, dtype`
			`"""`
			`sig = np.asarray(sig)`
move codestyle to black 2019-07-03 12:40:16 +00:00			`if sig.dtype.kind != "f":`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`raise TypeError("'sig' must be a float array")`
			`dtype = np.dtype(dtype)`
move codestyle to black 2019-07-03 12:40:16 +00:00			`if dtype.kind not in "iu":`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`raise TypeError("'dtype' must be an integer type")`

			`i = np.iinfo(dtype)`
move codestyle to black 2019-07-03 12:40:16 +00:00			`abs_max = 2 ** (i.bits - 1)`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`offset = i.min + abs_max`
			`return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)`


			`def display(data):`
			`import IPython.display as ipd`
move codestyle to black 2019-07-03 12:40:16 +00:00
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`aud = ipd.Audio(data, rate=16000)`
			`return aud`


			`def player_gen():`
			`audio_interface = pyaudio.PyAudio()`
move codestyle to black 2019-07-03 12:40:16 +00:00			`_audio_stream = audio_interface.open(`
			`format=pyaudio.paInt16,`
			`channels=1,`
			`rate=OUTPUT_SAMPLE_RATE,`
			`output=True,`
			`)`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00
			`def play_device(data):`
			`_audio_stream.write(data)`
			`# _audio_stream.close()`

			`return play_device`


			`def synthesize_corpus():`
			`tts_model = TTSModel()`
			`all_data = []`
move codestyle to black 2019-07-03 12:40:16 +00:00			`for (i, line) in enumerate(open("corpus.txt").readlines()):`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`print('synthesizing... "{}"'.format(line.strip()))`
			`data = tts_model.synth_speech(line.strip())`
			`all_data.append(data)`
			`return all_data`


			`def play_corpus(corpus_synths):`
			`player = player_gen()`
			`for d in corpus_synths:`
			`player(d)`


			`def main():`
			`corpus_synth_data = synthesize_corpus()`
			`play_corpus(corpus_synth_data)`
update comments 2019-07-05 09:03:04 +00:00			`import pdb`
move codestyle to black 2019-07-03 12:40:16 +00:00
update comments 2019-07-05 09:03:04 +00:00			`pdb.set_trace()`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00

move codestyle to black 2019-07-03 12:40:16 +00:00			`if __name__ == "__main__":`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`main()`