tacotron2/taco2/tts.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import torch
import klepto
import argparse
import warnings
from pathlib import Path
from .model import Tacotron2
from glow import WaveGlow
from .hparams import HParams
from .layers import TacotronSTFT
from .text import text_to_sequence
from .denoiser import Denoiser
from .audio_processing import griffin_lim, postprocess_audio

# Sample rate (Hz) used as the default `sample_rate` of the synthesis methods
# and as the rate of the playback stream opened by player_gen().
OUTPUT_SAMPLE_RATE = 22050

# Iteration count handed to griffin_lim() by the Griffin-Lim vocoder path.
GL_ITERS = 30

# Values accepted for the `vocoder` argument of TTSModel.synth_speech_array.
# The "wavglow" spelling is the runtime value callers pass; keep it as-is.
VOCODER_WAVEGLOW = "wavglow"
VOCODER_GL = "gl"

# Keyword arguments for the WaveGlow constructor; config from
# https://github.com/NVIDIA/waveglow/blob/master/config.json
WAVEGLOW_CONFIG = dict(
    n_mel_channels=40,
    n_flows=12,
    n_group=8,
    n_early_every=4,
    n_early_size=2,
    WN_config=dict(n_layers=8, n_channels=256, kernel_size=3),
)


class TTSModel:
    """Text-to-speech wrapper: Tacotron2 mel generation followed by a vocoder.

    A Tacotron2 checkpoint is always loaded. When ``waveglow_path`` is truthy
    a WaveGlow vocoder and a ``Denoiser`` are loaded as well and
    ``synth_speech`` uses WaveGlow; otherwise ``synth_speech`` uses
    Griffin-Lim. Models are moved to CUDA when ``torch.cuda.is_available()``.

    Attributes set by ``__init__``:
        hparams: ``HParams(**kwargs)``.
        model: the Tacotron2 network in eval mode.
        k_cache: klepto file archive backing the ``synth_speech`` cache.
        waveglow, denoiser: only present when ``waveglow_path`` is truthy.
        synth_speech: ``_synth_speech`` or ``_synth_speech_fast`` wrapped in
            ``klepto.safe.inf_cache``; call it as
            ``synth_speech(text, speed=1.0, sample_rate=OUTPUT_SAMPLE_RATE)``.
        taco_stft: ``TacotronSTFT`` used by the Griffin-Lim path.
    """

    def __init__(self, tacotron2_path, waveglow_path, **kwargs):
        """Load the checkpoints and build the cached ``synth_speech`` callable.

        Args:
            tacotron2_path: checkpoint whose ``"state_dict"`` entry is loaded
                into ``Tacotron2``.
            waveglow_path: WaveGlow checkpoint; a falsy value selects the
                Griffin-Lim vocoder and skips loading WaveGlow.
            **kwargs: forwarded to ``HParams``.
        """
        super().__init__()
        hparams = HParams(**kwargs)
        self.hparams = hparams

        use_cuda = torch.cuda.is_available()
        # With CUDA keep torch.load's default placement; without it, remap
        # every stored tensor to the CPU.
        map_location = None if use_cuda else "cpu"

        # NOTE(security): torch.load unpickles the file, so only checkpoints
        # from a trusted source should be passed in.
        self.model = Tacotron2(hparams)
        checkpoint = torch.load(tacotron2_path, map_location=map_location)
        self.model.load_state_dict(checkpoint["state_dict"])
        if use_cuda:
            self.model.cuda()
        self.model.eval()

        self.k_cache = klepto.archives.file_archive(cached=False)
        if waveglow_path:
            wave_params = torch.load(waveglow_path, map_location=map_location)
            try:
                # Checkpoint that is a plain state dict.
                waveglow = WaveGlow(**WAVEGLOW_CONFIG)
                waveglow.load_state_dict(wave_params)
            except Exception:
                # Fall back to a checkpoint that stores the whole module under
                # "model". `Exception` (not a bare except) so Ctrl-C and
                # SystemExit are not swallowed while loading.
                waveglow = wave_params["model"]
                waveglow = waveglow.remove_weightnorm(waveglow)
            self.waveglow = waveglow
            if use_cuda:
                self.waveglow.cuda()
            self.waveglow.eval()
            # workaround from
            # https://github.com/NVIDIA/waveglow/issues/127
            for module in self.waveglow.modules():
                if "Conv" in str(type(module)):
                    module.padding_mode = "zeros"
            # NOTE(review): kept exactly as in the original; the dtype
            # round-trip on the invertible convolutions looks deliberate but
            # its purpose is not visible from this file - confirm.
            for conv in self.waveglow.convinv:
                conv.float().half()
            self.denoiser = Denoiser(
                self.waveglow, n_mel_channels=hparams.n_mel_channels
            )
            self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
                self._synth_speech
            )
        else:
            self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
                self._synth_speech_fast
            )
        self.taco_stft = TacotronSTFT(
            hparams.filter_length,
            hparams.hop_length,
            hparams.win_length,
            n_mel_channels=hparams.n_mel_channels,
            sampling_rate=hparams.sampling_rate,
            mel_fmax=4000,
        )

    def _generate_mel_postnet(self, text):
        """Run Tacotron2 inference on ``text`` and return the post-net mel output.

        The text is converted with ``text_to_sequence`` using the
        ``english_cleaners`` pipeline and given a leading batch axis of 1.
        """
        sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[None, :]
        sequence = torch.from_numpy(sequence)
        if torch.cuda.is_available():
            sequence = sequence.cuda()
        sequence = sequence.long()
        with torch.no_grad():
            # Only the second element (post-net mel) of the 4-tuple is used.
            _, mel_outputs_postnet, _, _ = self.model.inference(sequence)
        return mel_outputs_postnet

    def synth_speech_array(self, text, vocoder):
        """Synthesize ``text`` and return the raw waveform as a numpy array.

        Args:
            text: input text.
            vocoder: ``VOCODER_WAVEGLOW`` or ``VOCODER_GL``.

        Returns:
            The vocoder output moved to the CPU and converted with ``.numpy()``.

        Raises:
            ValueError: if ``vocoder`` is not one of the two known values.
            AttributeError: if WaveGlow is requested but the instance was
                built without ``waveglow_path`` (no ``waveglow`` attribute).
        """
        # Reject an unknown vocoder before paying for Tacotron2 inference.
        if vocoder not in (VOCODER_WAVEGLOW, VOCODER_GL):
            raise ValueError("vocoder arg should be one of [wavglow|gl]")

        mel_outputs_postnet = self._generate_mel_postnet(text)

        if vocoder == VOCODER_WAVEGLOW:
            with torch.no_grad():
                audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)
                audio_t = self.denoiser(audio_t, 0.1)[0]
            audio = audio_t[0].data
        else:
            # Griffin-Lim: undo the mel normalization, multiply by the STFT's
            # mel basis, scale, then run GL_ITERS Griffin-Lim iterations.
            mel_decompress = self.taco_stft.spectral_de_normalize(mel_outputs_postnet)
            mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
            spec_from_mel_scaling = 1000
            spec_from_mel = torch.mm(mel_decompress[0], self.taco_stft.mel_basis)
            spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
            spec_from_mel = spec_from_mel * spec_from_mel_scaling
            if torch.cuda.is_available():
                spec_from_mel = spec_from_mel.cuda()
            # The last frame along the final axis is dropped, as in the original.
            audio = griffin_lim(
                spec_from_mel[:, :, :-1],
                self.taco_stft.stft_fn,
                GL_ITERS,
            )
            audio = audio.squeeze()
        return audio.cpu().numpy()

    def _synthesize(self, text, vocoder, speed, sample_rate):
        """Shared body of the two cached entry points.

        Synthesizes with ``vocoder`` and passes the waveform through
        ``postprocess_audio`` (source rate ``hparams.sampling_rate``, target
        rate ``sample_rate``, tempo ``speed``), returning its result.
        """
        audio = self.synth_speech_array(text, vocoder)
        return postprocess_audio(
            audio,
            src_rate=self.hparams.sampling_rate,
            dst_rate=sample_rate,
            tempo=speed,
        )

    def _synth_speech(
        self, text, speed: float = 1.0, sample_rate: int = OUTPUT_SAMPLE_RATE
    ):
        """Synthesize ``text`` with WaveGlow and post-process the audio."""
        return self._synthesize(text, VOCODER_WAVEGLOW, speed, sample_rate)

    def _synth_speech_fast(
        self, text, speed: float = 1.0, sample_rate: int = OUTPUT_SAMPLE_RATE
    ):
        """Synthesize ``text`` with Griffin-Lim and post-process the audio."""
        return self._synthesize(text, VOCODER_GL, speed, sample_rate)


def player_gen():
    """Open a pyaudio output stream and return a function that writes to it.

    The stream is mono, 16-bit (``pyaudio.paInt16``) at ``OUTPUT_SAMPLE_RATE``.
    If ``pyaudio`` cannot be imported a warning is issued and ``None`` is
    returned instead of a player.
    """
    try:
        import pyaudio
    except ModuleNotFoundError:
        warnings.warn("module 'pyaudio' is not installed requried for playback")
        return None

    interface = pyaudio.PyAudio()
    stream = interface.open(
        output=True,
        rate=OUTPUT_SAMPLE_RATE,
        channels=1,
        format=pyaudio.paInt16,
    )

    def play_device(data):
        # The stream is never closed here, so the same player can be called
        # repeatedly.
        stream.write(data)

    return play_device


def repl(tts_model):
    """Build a one-step prompt/synthesize/play function for ``tts_model``.

    Args:
        tts_model: object exposing ``synth_speech(text)``.

    Returns:
        A zero-argument function that reads one line from the ``tts >``
        prompt, synthesizes it and plays the result. Blank lines are ignored.
        When ``player_gen()`` returned ``None`` (pyaudio missing, a warning
        was already issued) the text is still synthesized but playback is
        skipped instead of failing on a ``None`` call.
    """
    player = player_gen()

    def loop():
        text = input("tts >").strip()
        if not text:
            # Nothing to say; do not send an empty string to the model.
            return
        data = tts_model.synth_speech(text)
        if player is None:
            # No audio backend available; calling None would raise TypeError.
            return
        player(data)

    return loop


def main():
    """Command-line entry point: load the models and run the interactive prompt.

    Parses ``-t/--tacotron2_path`` and ``-w/--waveglow_path`` (both converted
    to ``Path``), builds a ``TTSModel`` from them and calls the ``repl`` step
    function in a loop. End of input (Ctrl-D) or Ctrl-C at the prompt ends the
    loop and returns normally instead of surfacing a traceback.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "-t",
        "--tacotron2_path",
        type=Path,
        default="./tacotron.pt",
        help="Path to a tacotron2 model",
    )
    parser.add_argument(
        "-w",
        "--waveglow_path",
        type=Path,
        default="./waveglow_256channels.pt",
        help="Path to a waveglow model",
    )
    args = parser.parse_args()
    tts_model = TTSModel(**vars(args))
    interactive_loop = repl(tts_model)
    try:
        while True:
            interactive_loop()
    except (EOFError, KeyboardInterrupt):
        # These are the only ways to leave the prompt; finish the prompt line
        # so the shell starts on a fresh one.
        print()


# Start the interactive TTS prompt only when this module is the program entry
# point, not when it is imported.
if __name__ == "__main__":
    main()
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`#!/usr/bin/env python`
			`# -- coding: utf-8 --`

			`import numpy as np`
			`import torch`
packaged taco2 2019-09-20 19:49:30 +00:00			`import klepto`
add tts cli args 2019-10-09 10:33:29 +00:00			`import argparse`
1. make pyaudio as extra requirement 2. warn if pyaudio not installed if player_gen is used 2020-01-22 08:22:59 +00:00			`import warnings`
add tts cli args 2019-10-09 10:33:29 +00:00			`from pathlib import Path`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`from .model import Tacotron2`
packaged taco2 2019-09-20 19:49:30 +00:00			`from glow import WaveGlow`
			`from .hparams import HParams`
1. implemented corpus wav generator 2. refactor 2019-10-07 10:30:35 +00:00			`from .layers import TacotronSTFT`
packaged taco2 2019-09-20 19:49:30 +00:00			`from .text import text_to_sequence`
1. add griffin lim support 2. add denoiser 3. add support to handle old and new waveglow models 2019-09-26 05:23:09 +00:00			`from .denoiser import Denoiser`
1. implemented corpus wav generator 2. refactor 2019-10-07 10:30:35 +00:00			`from .audio_processing import griffin_lim, postprocess_audio`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00
update tempo and output sample rate 2019-10-09 11:55:51 +00:00			`OUTPUT_SAMPLE_RATE = 22050`
update dependency version and add speed/sample rate and vocoder args 2019-11-28 11:27:36 +00:00			`GL_ITERS = 30`
1. fix synth_speech_fast interface 2. rename private methods 2019-11-28 12:37:47 +00:00			`VOCODER_WAVEGLOW, VOCODER_GL = "wavglow", "gl"`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00
update comments 2019-07-05 09:03:04 +00:00			`# config from`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`# https://github.com/NVIDIA/waveglow/blob/master/config.json`
			`WAVEGLOW_CONFIG = {`
1. add griffin lim support 2. add denoiser 3. add support to handle old and new waveglow models 2019-09-26 05:23:09 +00:00			`"n_mel_channels": 40,`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`"n_flows": 12,`
			`"n_group": 8,`
			`"n_early_every": 4,`
			`"n_early_size": 2,`
move codestyle to black 2019-07-03 12:40:16 +00:00			`"WN_config": {"n_layers": 8, "n_channels": 256, "kernel_size": 3},`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`}`


			`class TTSModel(object):`
			`"""docstring for TTSModel."""`

1. implemented corpus wav generator 2. refactor 2019-10-07 10:30:35 +00:00			`def __init__(self, tacotron2_path, waveglow_path, **kwargs):`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`super(TTSModel, self).__init__()`
1. implemented corpus wav generator 2. refactor 2019-10-07 10:30:35 +00:00			`hparams = HParams(**kwargs)`
update dependency version and add speed/sample rate and vocoder args 2019-11-28 11:27:36 +00:00			`self.hparams = hparams`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`self.model = Tacotron2(hparams)`
enable gpu support if available 2019-11-27 16:12:10 +00:00			`if torch.cuda.is_available():`
			`self.model.load_state_dict(torch.load(tacotron2_path)["state_dict"])`
			`self.model.cuda().eval()`
			`else:`
			`self.model.load_state_dict(`
			`torch.load(tacotron2_path, map_location="cpu")["state_dict"]`
			`)`
			`self.model.eval()`
1. add griffin lim support 2. add denoiser 3. add support to handle old and new waveglow models 2019-09-26 05:23:09 +00:00			`self.k_cache = klepto.archives.file_archive(cached=False)`
1. implemented corpus wav generator 2. refactor 2019-10-07 10:30:35 +00:00			`if waveglow_path:`
enable gpu support if available 2019-11-27 16:12:10 +00:00			`if torch.cuda.is_available():`
			`wave_params = torch.load(waveglow_path)`
			`else:`
			`wave_params = torch.load(waveglow_path, map_location="cpu")`
1. implemented corpus wav generator 2. refactor 2019-10-07 10:30:35 +00:00			`try:`
			`self.waveglow = WaveGlow(**WAVEGLOW_CONFIG)`
			`self.waveglow.load_state_dict(wave_params)`
			`except:`
			`self.waveglow = wave_params["model"]`
			`self.waveglow = self.waveglow.remove_weightnorm(self.waveglow)`
enable gpu support if available 2019-11-27 16:12:10 +00:00			`if torch.cuda.is_available():`
			`self.waveglow.cuda().eval()`
			`else:`
1. implemented corpus wav generator 2. refactor 2019-10-07 10:30:35 +00:00			`self.waveglow.eval()`
			`# workaround from`
			`# https://github.com/NVIDIA/waveglow/issues/127`
			`for m in self.waveglow.modules():`
			`if "Conv" in str(type(m)):`
			`setattr(m, "padding_mode", "zeros")`
			`for k in self.waveglow.convinv:`
enable gpu support if available 2019-11-27 16:12:10 +00:00			`k.float().half()`
1. implemented corpus wav generator 2. refactor 2019-10-07 10:30:35 +00:00			`self.denoiser = Denoiser(`
			`self.waveglow, n_mel_channels=hparams.n_mel_channels`
			`)`
			`self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(`
1. fix synth_speech_fast interface 2. rename private methods 2019-11-28 12:37:47 +00:00			`self._synth_speech`
1. implemented corpus wav generator 2. refactor 2019-10-07 10:30:35 +00:00			`)`
			`else:`
			`self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(`
1. fix synth_speech_fast interface 2. rename private methods 2019-11-28 12:37:47 +00:00			`self._synth_speech_fast`
1. implemented corpus wav generator 2. refactor 2019-10-07 10:30:35 +00:00			`)`
			`self.taco_stft = TacotronSTFT(`
			`hparams.filter_length,`
			`hparams.hop_length,`
			`hparams.win_length,`
			`n_mel_channels=hparams.n_mel_channels,`
			`sampling_rate=hparams.sampling_rate,`
			`mel_fmax=4000,`
			`)`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00
1. fix synth_speech_fast interface 2. rename private methods 2019-11-28 12:37:47 +00:00			`def _generate_mel_postnet(self, text):`
packaged taco2 2019-09-20 19:49:30 +00:00			`sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[None, :]`
enable gpu support if available 2019-11-27 16:12:10 +00:00			`if torch.cuda.is_available():`
			`sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()`
			`else:`
			`sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long()`
1. implemented corpus wav generator 2. refactor 2019-10-07 10:30:35 +00:00			`with torch.no_grad():`
			`mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(`
			`sequence`
			`)`
			`return mel_outputs_postnet`

update dependency version and add speed/sample rate and vocoder args 2019-11-28 11:27:36 +00:00			`def synth_speech_array(self, text, vocoder):`
1. fix synth_speech_fast interface 2. rename private methods 2019-11-28 12:37:47 +00:00			`mel_outputs_postnet = self._generate_mel_postnet(text)`
1. implemented corpus wav generator 2. refactor 2019-10-07 10:30:35 +00:00
1. fix synth_speech_fast interface 2. rename private methods 2019-11-28 12:37:47 +00:00			`if vocoder == VOCODER_WAVEGLOW:`
update dependency version and add speed/sample rate and vocoder args 2019-11-28 11:27:36 +00:00			`with torch.no_grad():`
			`audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)`
			`audio_t = self.denoiser(audio_t, 0.1)[0]`
tested gl/wavglow working 2019-11-28 12:22:05 +00:00			`audio = audio_t[0].data`
1. fix synth_speech_fast interface 2. rename private methods 2019-11-28 12:37:47 +00:00			`elif vocoder == VOCODER_GL:`
update dependency version and add speed/sample rate and vocoder args 2019-11-28 11:27:36 +00:00			`mel_decompress = self.taco_stft.spectral_de_normalize(mel_outputs_postnet)`
			`mel_decompress = mel_decompress.transpose(1, 2).data.cpu()`
			`spec_from_mel_scaling = 1000`
			`spec_from_mel = torch.mm(mel_decompress[0], self.taco_stft.mel_basis)`
			`spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)`
			`spec_from_mel = spec_from_mel * spec_from_mel_scaling`
tested gl/wavglow working 2019-11-28 12:22:05 +00:00			`spec_from_mel = (`
			`spec_from_mel.cuda() if torch.cuda.is_available() else spec_from_mel`
			`)`
update dependency version and add speed/sample rate and vocoder args 2019-11-28 11:27:36 +00:00			`audio = griffin_lim(`
			`torch.autograd.Variable(spec_from_mel[:, :, :-1]),`
			`self.taco_stft.stft_fn,`
1. fix synth_speech_fast interface 2. rename private methods 2019-11-28 12:37:47 +00:00			`GL_ITERS,`
update dependency version and add speed/sample rate and vocoder args 2019-11-28 11:27:36 +00:00			`)`
			`audio = audio.squeeze()`
			`else:`
			`raise ValueError("vocoder arg should be one of [wavglow\|gl]")`
tested gl/wavglow working 2019-11-28 12:22:05 +00:00			`audio = audio.cpu().numpy()`
enable gpu support if available 2019-11-27 16:12:10 +00:00			`return audio`

1. fix synth_speech_fast interface 2. rename private methods 2019-11-28 12:37:47 +00:00			`def _synth_speech(`
tested gl/wavglow working 2019-11-28 12:22:05 +00:00			`self, text, speed: float = 1.0, sample_rate: int = OUTPUT_SAMPLE_RATE`
			`):`
1. fix synth_speech_fast interface 2. rename private methods 2019-11-28 12:37:47 +00:00			`audio = self.synth_speech_array(text, VOCODER_WAVEGLOW)`
1. clean-up 2. update readme and release info 2019-10-04 10:45:30 +00:00
1. implemented corpus wav generator 2. refactor 2019-10-07 10:30:35 +00:00			`return postprocess_audio(`
1. fix synth_speech_fast interface 2. rename private methods 2019-11-28 12:37:47 +00:00			`audio,`
			`src_rate=self.hparams.sampling_rate,`
			`dst_rate=sample_rate,`
			`tempo=speed,`
1. clean-up 2. update readme and release info 2019-10-04 10:45:30 +00:00			`)`
1. implemented corpus wav generator 2. refactor 2019-10-07 10:30:35 +00:00
1. fix synth_speech_fast interface 2. rename private methods 2019-11-28 12:37:47 +00:00			`def _synth_speech_fast(`
tested gl/wavglow working 2019-11-28 12:22:05 +00:00			`self, text, speed: float = 1.0, sample_rate: int = OUTPUT_SAMPLE_RATE`
			`):`
1. fix synth_speech_fast interface 2. rename private methods 2019-11-28 12:37:47 +00:00			`audio = self.synth_speech_array(text, VOCODER_GL)`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00
1. implemented corpus wav generator 2. refactor 2019-10-07 10:30:35 +00:00			`return postprocess_audio(`
1. fix synth_speech_fast interface 2. rename private methods 2019-11-28 12:37:47 +00:00			`audio,`
			`tempo=speed,`
			`src_rate=self.hparams.sampling_rate,`
			`dst_rate=sample_rate,`
1. implemented corpus wav generator 2. refactor 2019-10-07 10:30:35 +00:00			`)`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00

			`def player_gen():`
1. make pyaudio as extra requirement 2. warn if pyaudio not installed if player_gen is used 2020-01-22 08:22:59 +00:00			`try:`
			`import pyaudio`
			`except ModuleNotFoundError:`
			`warnings.warn("module 'pyaudio' is not installed requried for playback")`
			`return`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`audio_interface = pyaudio.PyAudio()`
move codestyle to black 2019-07-03 12:40:16 +00:00			`_audio_stream = audio_interface.open(`
packaged taco2 2019-09-20 19:49:30 +00:00			`format=pyaudio.paInt16, channels=1, rate=OUTPUT_SAMPLE_RATE, output=True`
move codestyle to black 2019-07-03 12:40:16 +00:00			`)`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00
			`def play_device(data):`
			`_audio_stream.write(data)`
			`# _audio_stream.close()`

			`return play_device`


add tts cli args 2019-10-09 10:33:29 +00:00			`def repl(tts_model):`
packaged taco2 2019-09-20 19:49:30 +00:00			`player = player_gen()`
1. clean-up 2. update readme and release info 2019-10-04 10:45:30 +00:00
packaged taco2 2019-09-20 19:49:30 +00:00			`def loop():`
1. clean-up 2. update readme and release info 2019-10-04 10:45:30 +00:00			`text = input("tts >")`
packaged taco2 2019-09-20 19:49:30 +00:00			`data = tts_model.synth_speech(text.strip())`
			`player(data)`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00
1. clean-up 2. update readme and release info 2019-10-04 10:45:30 +00:00			`return loop`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00

			`def main():`
add tts cli args 2019-10-09 10:33:29 +00:00			`parser = argparse.ArgumentParser(`
			`formatter_class=argparse.ArgumentDefaultsHelpFormatter`
			`)`
			`parser.add_argument(`
			`"-t",`
			`"--tacotron2_path",`
			`type=Path,`
			`default="./tacotron.pt",`
			`help="Path to a tacotron2 model",`
			`)`
			`parser.add_argument(`
			`"-w",`
			`"--waveglow_path",`
			`type=Path,`
			`default="./waveglow_256channels.pt",`
			`help="Path to a waveglow model",`
			`)`
			`args = parser.parse_args()`
			`tts_model = TTSModel(**vars(args))`
			`interactive_loop = repl(tts_model)`
packaged taco2 2019-09-20 19:49:30 +00:00			`while True:`
			`interactive_loop()`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00

move codestyle to black 2019-07-03 12:40:16 +00:00			`if __name__ == "__main__":`
cleanup unused code and fix packaging issues 2019-07-03 11:56:12 +00:00			`main()`