mirror of https://github.com/malarinv/tacotron2
parent
36c731cad0
commit
edbb1bd57d
|
|
@ -2,7 +2,12 @@
|
||||||
History
|
History
|
||||||
=======
|
=======
|
||||||
|
|
||||||
|
0.2.0 (2019-10-04)
|
||||||
|
------------------
|
||||||
|
* Add Griffin-Lim support
|
||||||
|
* Allow passing hyper-parameters to TTSModel.
|
||||||
|
|
||||||
|
|
||||||
0.1.0 (2019-09-20)
|
0.1.0 (2019-09-20)
|
||||||
------------------
|
------------------
|
||||||
|
|
||||||
* First release on PyPI.
|
* First release on PyPI.
|
||||||
|
|
|
||||||
21
README.md
21
README.md
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
[](https://github.com/python/black)
|
[](https://github.com/python/black)
|
||||||
|
|
||||||
> Generate speech audio from text
|
> Generates speech audio from text
|
||||||
---
|
---
|
||||||
|
|
||||||
# Table of Contents
|
# Table of Contents
|
||||||
|
|
@ -13,22 +13,25 @@
|
||||||
|
|
||||||
# Features
|
# Features
|
||||||
|
|
||||||
* Tacotron2 Synthesized Speech
|
* TTS using Tacotron2
|
||||||
|
|
||||||
|
|
||||||
# Installation
|
# Installation
|
||||||
Install the packages with for production use. It downloads the dependencies
|
To install the package and its dependencies, run:
|
||||||
```bash
|
```bash
|
||||||
python setup.py install
|
python setup.py install
|
||||||
```
|
```
|
||||||
|
or with pip
|
||||||
|
```bash
|
||||||
|
pip install .
|
||||||
|
```
|
||||||
|
|
||||||
> Still facing an issue? Check the [Issues](#issues) section or open a new issue.
|
The installation should work on Python 3.6 or newer. Untested on Python 2.7.
|
||||||
|
|
||||||
The installation should be smooth with Python 3.6 or newer.
|
|
||||||
|
|
||||||
# Usage
|
# Usage
|
||||||
> API
|
|
||||||
```python
|
```python
|
||||||
tts_model = TTSModel("/path/to/tacotron2_model","/path/to/waveglow_model")
|
from taco2.tts import TTSModel
|
||||||
SPEECH_AUDIO = tts_model.synth_speech(TEXT)
|
tts_model = TTSModel("/path/to/tacotron2_model","/path/to/waveglow_model") # Loads the models
|
||||||
|
SPEECH_AUDIO = tts_model.synth_speech(TEXT) # Returns the wav buffer
|
||||||
```
|
```
|
||||||
|
If `'/path/to/waveglow_model'` is `None`, the *Griffin-Lim vocoder* will be used.
|
||||||
|
|
|
||||||
8
setup.py
8
setup.py
|
|
@ -29,7 +29,7 @@ packages = find_packages()
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
author="Malar Kannan",
|
author="Malar Kannan",
|
||||||
author_email="malar@agaralabs.com",
|
author_email="malarkannan.invention@gmail.com",
|
||||||
classifiers=[
|
classifiers=[
|
||||||
"Development Status :: 2 - Pre-Alpha",
|
"Development Status :: 2 - Pre-Alpha",
|
||||||
"Intended Audience :: Developers",
|
"Intended Audience :: Developers",
|
||||||
|
|
@ -46,14 +46,14 @@ setup(
|
||||||
install_requires=requirements,
|
install_requires=requirements,
|
||||||
long_description=readme + "\n\n" + history,
|
long_description=readme + "\n\n" + history,
|
||||||
include_package_data=True,
|
include_package_data=True,
|
||||||
keywords="tacotron2",
|
keywords="tacotron2 tts",
|
||||||
name="taco2-tts",
|
name="taco2-tts",
|
||||||
packages=packages,
|
packages=packages,
|
||||||
setup_requires=setup_requirements,
|
setup_requires=setup_requirements,
|
||||||
test_suite="tests",
|
test_suite="tests",
|
||||||
tests_require=test_requirements,
|
tests_require=test_requirements,
|
||||||
url="https://github.com/malarinv/tacotron2",
|
url="https://github.com/malarinv/tacotron2",
|
||||||
version="0.1.0",
|
version="0.2.0",
|
||||||
zip_safe=False,
|
zip_safe=False,
|
||||||
entry_points={"console_scripts": ("tts_debug = tts:main",)},
|
entry_points={"console_scripts": ("tts_debug = taco2.tts:main",)},
|
||||||
)
|
)
|
||||||
|
|
|
||||||
67
taco2/tts.py
67
taco2/tts.py
|
|
@ -46,7 +46,7 @@ class TTSModel(object):
|
||||||
self.waveglow.load_state_dict(wave_params)
|
self.waveglow.load_state_dict(wave_params)
|
||||||
self.waveglow.eval()
|
self.waveglow.eval()
|
||||||
except:
|
except:
|
||||||
self.waveglow = wave_params['model']
|
self.waveglow = wave_params["model"]
|
||||||
self.waveglow = self.waveglow.remove_weightnorm(self.waveglow)
|
self.waveglow = self.waveglow.remove_weightnorm(self.waveglow)
|
||||||
self.waveglow.eval()
|
self.waveglow.eval()
|
||||||
# workaround from
|
# workaround from
|
||||||
|
|
@ -60,7 +60,6 @@ class TTSModel(object):
|
||||||
self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(self.synth_speech)
|
self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(self.synth_speech)
|
||||||
self.denoiser = Denoiser(self.waveglow)
|
self.denoiser = Denoiser(self.waveglow)
|
||||||
|
|
||||||
|
|
||||||
def synth_speech(self, text):
|
def synth_speech(self, text):
|
||||||
sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[None, :]
|
sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[None, :]
|
||||||
sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long()
|
sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long()
|
||||||
|
|
@ -78,15 +77,23 @@ class TTSModel(object):
|
||||||
data = float2pcm(float_data)
|
data = float2pcm(float_data)
|
||||||
return data.tobytes()
|
return data.tobytes()
|
||||||
|
|
||||||
def synth_speech_algo(self,text,griffin_iters=60):
|
def synth_speech_algo(self, text, griffin_iters=60):
|
||||||
sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[None, :]
|
sequence = np.array(text_to_sequence(text, ["english_cleaners"]))[None, :]
|
||||||
sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long()
|
sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long()
|
||||||
mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(sequence)
|
mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(sequence)
|
||||||
from .hparams import HParams
|
from .hparams import HParams
|
||||||
from .layers import TacotronSTFT
|
from .layers import TacotronSTFT
|
||||||
from .audio_processing import griffin_lim
|
from .audio_processing import griffin_lim
|
||||||
|
|
||||||
hparams = HParams()
|
hparams = HParams()
|
||||||
taco_stft = TacotronSTFT(hparams.filter_length, hparams.hop_length, hparams.win_length, n_mel_channels=hparams.n_mel_channels, sampling_rate=hparams.sampling_rate, mel_fmax=4000)
|
taco_stft = TacotronSTFT(
|
||||||
|
hparams.filter_length,
|
||||||
|
hparams.hop_length,
|
||||||
|
hparams.win_length,
|
||||||
|
n_mel_channels=hparams.n_mel_channels,
|
||||||
|
sampling_rate=hparams.sampling_rate,
|
||||||
|
mel_fmax=4000,
|
||||||
|
)
|
||||||
mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
|
mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
|
||||||
mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
|
mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
|
||||||
spec_from_mel_scaling = 1000
|
spec_from_mel_scaling = 1000
|
||||||
|
|
@ -94,7 +101,11 @@ class TTSModel(object):
|
||||||
spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
|
spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
|
||||||
spec_from_mel = spec_from_mel * spec_from_mel_scaling
|
spec_from_mel = spec_from_mel * spec_from_mel_scaling
|
||||||
|
|
||||||
audio = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]), taco_stft.stft_fn, griffin_iters)
|
audio = griffin_lim(
|
||||||
|
torch.autograd.Variable(spec_from_mel[:, :, :-1]),
|
||||||
|
taco_stft.stft_fn,
|
||||||
|
griffin_iters,
|
||||||
|
)
|
||||||
audio = audio.squeeze()
|
audio = audio.squeeze()
|
||||||
audio = audio.cpu().numpy()
|
audio = audio.cpu().numpy()
|
||||||
|
|
||||||
|
|
@ -102,6 +113,8 @@ class TTSModel(object):
|
||||||
float_data = resample(slow_data, TTS_SAMPLE_RATE, OUTPUT_SAMPLE_RATE)
|
float_data = resample(slow_data, TTS_SAMPLE_RATE, OUTPUT_SAMPLE_RATE)
|
||||||
data = float2pcm(float_data)
|
data = float2pcm(float_data)
|
||||||
return data.tobytes()
|
return data.tobytes()
|
||||||
|
|
||||||
|
|
||||||
# adapted from
|
# adapted from
|
||||||
# https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
|
# https://github.com/mgeier/python-audio/blob/master/audio-files/utility.py
|
||||||
def float2pcm(sig, dtype="int16"):
|
def float2pcm(sig, dtype="int16"):
|
||||||
|
|
@ -140,13 +153,6 @@ def float2pcm(sig, dtype="int16"):
|
||||||
return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)
|
return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)
|
||||||
|
|
||||||
|
|
||||||
def display(data):
|
|
||||||
import IPython.display as ipd
|
|
||||||
|
|
||||||
aud = ipd.Audio(data, rate=16000)
|
|
||||||
return aud
|
|
||||||
|
|
||||||
|
|
||||||
def player_gen():
|
def player_gen():
|
||||||
audio_interface = pyaudio.PyAudio()
|
audio_interface = pyaudio.PyAudio()
|
||||||
_audio_stream = audio_interface.open(
|
_audio_stream = audio_interface.open(
|
||||||
|
|
@ -160,51 +166,22 @@ def player_gen():
|
||||||
return play_device
|
return play_device
|
||||||
|
|
||||||
|
|
||||||
def synthesize_corpus():
|
|
||||||
tts_model = TTSModel(
|
|
||||||
"/Users/malar/Work/tacotron2_statedict.pt",
|
|
||||||
"/Users/malar/Work/waveglow.pt",
|
|
||||||
)
|
|
||||||
all_data = []
|
|
||||||
for (i, line) in enumerate(open("corpus.txt").readlines()):
|
|
||||||
print(f'synthesizing... "{line.strip()}"')
|
|
||||||
data = tts_model.synth_speech(line.strip())
|
|
||||||
all_data.append(data)
|
|
||||||
return all_data
|
|
||||||
|
|
||||||
def repl():
|
def repl():
|
||||||
tts_model = TTSModel(
|
tts_model = TTSModel("/path/to/tacotron2.pt", "/path/to/waveglow.pt")
|
||||||
# "/Users/malar/Work/tacotron2_statedict.pt",
|
|
||||||
# "/Users/malar/Work/tacotron2_80_22000.pt",
|
|
||||||
"/path/to/tacotron2.pt",
|
|
||||||
# "/Users/malar/Work/tacotron2_40_22000.pt",
|
|
||||||
# "/Users/malar/Work/tacotron2_16000.pt",
|
|
||||||
"/path/to/waveglow.pt",
|
|
||||||
# "/Users/malar/Work/waveglow.pt",
|
|
||||||
# "/Users/malar/Work/waveglow_38000",
|
|
||||||
)
|
|
||||||
player = player_gen()
|
player = player_gen()
|
||||||
|
|
||||||
def loop():
|
def loop():
|
||||||
text = input('tts >')
|
text = input("tts >")
|
||||||
data = tts_model.synth_speech(text.strip())
|
data = tts_model.synth_speech(text.strip())
|
||||||
player(data)
|
player(data)
|
||||||
|
|
||||||
return loop
|
return loop
|
||||||
|
|
||||||
|
|
||||||
def play_corpus(corpus_synths):
|
|
||||||
player = player_gen()
|
|
||||||
for d in corpus_synths:
|
|
||||||
player(d)
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# corpus_synth_data = synthesize_corpus()
|
|
||||||
# play_corpus(corpus_synth_data)
|
|
||||||
interactive_loop = repl()
|
interactive_loop = repl()
|
||||||
while True:
|
while True:
|
||||||
interactive_loop()
|
interactive_loop()
|
||||||
# import pdb
|
|
||||||
# pdb.set_trace()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue