# Source: tacotron2/final.py (125 lines, 3.4 KiB, Python)
# Exported from a repository web view; blame timestamp: 2019-06-28 04:16:46 +00:00
#!/usr/bin/env python
# coding: utf-8
# import matplotlib
# import matplotlib.pylab as plt
# import IPython.display as ipd
import sys
import numpy as np
import torch
from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT, STFT
# from audio_processing import griffin_lim
from train import load_model
from text import text_to_sequence
# from denoiser import Denoiser
import os
import soundfile as sf
# (blame timestamp) 2019-06-28 08:54:36 +00:00
import pyaudio
import klepto
import IPython.display as ipd
# (blame timestamp) 2019-06-28 04:16:46 +00:00
import time
# Extend the import path with the local WaveGlow checkout.
# NOTE(review): presumably needed so the pickled WaveGlow model loaded below
# can resolve its classes -- confirm.
sys.path.append('waveglow/')

# Tacotron2 hyper-parameters, with the sampling rate pinned to 22050.
hparams = create_hparams()
hparams.sampling_rate = 22050

# Build the Tacotron2 model and restore its weights, mapped onto the CPU.
checkpoint_path = "checkpoint_15000"
model = load_model(hparams)
model.load_state_dict(
    torch.load(checkpoint_path, map_location='cpu')['state_dict'])
model.eval()

# Load the WaveGlow vocoder stored under the checkpoint's 'model' key.
waveglow_path = 'waveglow_256channels.pt'
waveglow = torch.load(waveglow_path, map_location='cpu')['model']
waveglow.eval()

# Cast every entry of waveglow.convinv to float32.
for k in waveglow.convinv:
    k.float()

# klepto file archive that backs the cache used by speech().
k_cache = klepto.archives.file_archive(cached=False)

# https://github.com/NVIDIA/waveglow/issues/127
# Work-around from the issue above: give every module whose type name
# contains 'Conv' a `padding_mode` attribute set to 'zeros'.
for m in waveglow.modules():
    if 'Conv' in str(type(m)):
        m.padding_mode = 'zeros'
def convert(array):
    """Slow down and resample synthesized audio through ffmpeg.

    The samples are written as a 22050 Hz WAV file, passed through ffmpeg
    with the ``atempo=0.80`` audio filter and a 16 kHz output rate, and the
    result is read back as 16-bit integers.

    Args:
        array: audio samples accepted by ``soundfile.write``.

    Returns:
        The converted audio as returned by ``soundfile.read`` with
        ``dtype='int16'``.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero
            status.
        FileNotFoundError: if the ``ffmpeg`` executable cannot be found.
    """
    # Function-scope imports keep this block self-contained.
    import subprocess
    import tempfile

    # A private scratch directory avoids clobbering (or being confused by)
    # 'sample.wav'/'sample0.wav' files in the working directory, and is
    # removed even when ffmpeg or the read-back fails.
    with tempfile.TemporaryDirectory() as workdir:
        src_path = os.path.join(workdir, 'sample.wav')
        dst_path = os.path.join(workdir, 'sample0.wav')
        sf.write(src_path, array, 22050)
        # Argument list instead of a shell string: no quoting pitfalls, and
        # check=True surfaces ffmpeg failures instead of ignoring them.
        subprocess.run(
            ['ffmpeg', '-i', src_path,
             '-filter:a', 'atempo=0.80',
             '-ar', '16k', dst_path],
            check=True)
        data, _rate = sf.read(dst_path, dtype='int16')
    return data
@klepto.safe.inf_cache(cache=k_cache)
def speech(t):
    """Synthesize speech audio for the text ``t``.

    The text is encoded with ``text_to_sequence`` using the
    ``english_cleaners`` cleaner, turned into a mel spectrogram by the
    Tacotron2 ``model``, vocoded by ``waveglow`` (sigma=0.666) and finally
    post-processed by ``convert``. The elapsed wall-clock time in seconds is
    printed before returning.

    Results are cached through klepto's ``inf_cache`` decorator backed by
    ``k_cache``.

    Args:
        t: text to synthesize.

    Returns:
        The audio returned by ``convert``.
    """
    start = time.time()
    # Add a leading batch axis: the models are fed a single-item batch.
    sequence = np.array(text_to_sequence(t, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).long()
    # Run both networks without autograd bookkeeping; this is inference only.
    with torch.no_grad():
        _, mel_outputs_postnet, _, _ = model.inference(sequence)
        audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
    data = convert(audio[0].detach().cpu().numpy())
    end = time.time()
    print(end - start)
    return data
# (blame timestamp) 2019-06-28 04:16:46 +00:00
def display(data):
    """Wrap ``data`` in an ``IPython.display.Audio`` object at 16000 Hz.

    Args:
        data: audio samples to wrap.

    Returns:
        The ``ipd.Audio`` object built from ``data``.
    """
    return ipd.Audio(data, rate=16000)
def player_gen():
    """Open a PyAudio output stream and return a function that plays to it.

    The stream is opened as mono, 16-bit integer samples at 16000 Hz.

    Returns:
        A callable ``play_device(data)`` that writes the raw bytes of
        ``data`` to the stream.
    """
    audio_interface = pyaudio.PyAudio()
    _audio_stream = audio_interface.open(format=pyaudio.paInt16,
                                         channels=1,
                                         rate=16000,
                                         output=True)

    def play_device(data):
        """Write the raw bytes of ``data`` to the output stream."""
        # tobytes() replaces the deprecated ndarray.tostring() alias and
        # produces the same bytes.
        _audio_stream.write(data.tobytes())

    # NOTE(review): the stream is intentionally left open so the returned
    # player can be called repeatedly; nothing in this block closes it.
    return play_device
def synthesize_corpus():
    """Synthesize every non-blank line of ``corpus.txt``.

    Each line is stripped of surrounding whitespace, announced on stdout and
    passed to ``speech``. Lines that are empty after stripping are skipped
    rather than handed to the synthesizer as empty text.

    Returns:
        A list with one ``speech`` result per synthesized line, in file
        order.
    """
    all_data = []
    # The context manager closes the corpus file once it has been read.
    with open('corpus.txt') as corpus_file:
        for line in corpus_file:
            text = line.strip()
            if not text:
                continue
            print('synthesizing... "{}"'.format(text))
            all_data.append(speech(text))
    return all_data
def play_corpus(corpus_synths):
    """Play each item of ``corpus_synths`` in order on one audio player.

    Args:
        corpus_synths: iterable of audio buffers accepted by the player
            returned from ``player_gen``.
    """
    play = player_gen()
    for clip in corpus_synths:
        play(clip)
def main():
    """Synthesize the corpus, play it back, then open an ipdb session."""
    corpus_synth_data = synthesize_corpus()
    play_corpus(corpus_synth_data)

    # Drop into the interactive debugger after playback; the synthesized
    # audio remains reachable there as `corpus_synth_data`.
    import ipdb
    ipdb.set_trace()


# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    main()