speech-scoring/tts_samplegen.py

259 lines
8.1 KiB
Python

import objc
from AppKit import NSSpeechSynthesizer, NSSpeechInputModeProperty
from AppKit import NSSpeechModePhoneme
from Foundation import NSURL
import json
import random
import os
import re
import subprocess
import time
import progressbar
from generate_similar import similar_phoneme_phrase,similar_phrase
# Label of the current generation run; it names both the audio directory and
# the manifest file under ./outputs/.
OUTPUT_NAME = 'go_home'
# Absolute directory (with trailing slash) under which audio files are written.
dest_dir = '{}/outputs/{}/'.format(os.path.abspath('.'), OUTPUT_NAME)
# Manifest path, relative to the current working directory.
dest_file = './outputs/{}.csv'.format(OUTPUT_NAME)
def hms_string(sec_elapsed):
    """Format a duration in seconds as ``H:MM:SS.ss``.

    Hours are left unpadded, minutes are zero-padded to two digits and
    seconds are shown with two integer digits and two decimals.
    """
    seconds_per_minute = 60
    seconds_per_hour = 60 * seconds_per_minute
    hours = int(sec_elapsed / seconds_per_hour)
    within_hour = sec_elapsed % seconds_per_hour
    minutes = int(within_hour / seconds_per_minute)
    seconds = sec_elapsed % 60.
    return "{}:{:>02}:{:>05.2f}".format(hours, minutes, seconds)
def prog_bar(title):
    """Build a progress bar together with a callback that changes its label.

    Returns:
        An ``(update_prog, prog)`` pair. ``prog`` is the
        ``progressbar.ProgressBar``; ``update_prog(current)`` replaces the
        label widget with one showing ``current`` and calls ``prog.update()``.
    """
    label_slot = 3  # position of the FormatLabel placeholder in ``widgets``
    widgets = [
        title,
        progressbar.Counter(),
        'th entry - ',
        progressbar.FormatLabel(''),
        ' [',
        progressbar.Bar(),
        '] - ',
        progressbar.ETA(),
    ]
    prog = progressbar.ProgressBar(widgets=widgets)

    def update_prog(current):
        # The bar holds a reference to this same list, so swapping the entry
        # in place is how the new label reaches it.
        widgets[label_slot] = progressbar.FormatLabel(current)
        prog.update()

    return update_prog, prog
def create_dir(direc):
    """Create directory ``direc`` (including missing parents) if it is absent.

    Nothing happens when something already exists at ``direc``.

    Raises:
        OSError: if creation fails and nothing exists at ``direc`` afterwards
            (for example, insufficient permissions).
    """
    try:
        os.makedirs(direc)
    except OSError:
        # Attempt first and inspect afterwards: a separate "exists?" check
        # before creating can race with another process making the same
        # directory in between. An already-present path is not an error.
        if not os.path.exists(direc):
            raise
def dest_filename(w, v, r, t):
    """Return an ``.aiff`` file name of the form ``w-v-r-t-N.aiff``.

    ``N`` is a random integer between 0 and 10000 (inclusive) appended to
    make repeated names for the same arguments less likely to coincide.
    """
    salt = str(random.randint(0, 10000))
    return '{}-{}-{}-{}-{}.aiff'.format(w, v, r, t, salt)
def dest_path(v, r, n):
    """Return ``(absolute, relative)`` paths for file ``n`` of voice ``v`` at rate ``r``.

    The relative path is ``v/r/n``; the absolute one is the same path
    prefixed with the module-level ``dest_dir``.
    """
    rel = '/'.join([v, str(r), n])
    return dest_dir + rel, rel
def cli_gen_audio(speech_cmd, rate, voice, out_path):
    """Render ``speech_cmd`` to ``out_path`` by running the ``say`` command.

    ``voice`` and ``rate`` are passed through the ``-v`` and ``-r`` options.
    The command's exit status is not inspected.
    """
    command = ['say', '-v', voice, '-r', str(rate), '-o', out_path, speech_cmd]
    subprocess.call(command)
class SynthFile(object):
    """Record describing one synthesized audio file.

    Attributes:
        word: text that was requested.
        phoneme: phoneme string (or substituted phrase) stored for the file.
        filename: path of the audio file as recorded by the caller.
        voice: voice name.
        voice_lang: language of the voice.
        rate: speaking rate.
        variant: the ``operation`` value passed to the constructor.
    """

    def __init__(self, word, phon, filename, voice, voice_lang, rate, operation):
        super(SynthFile, self).__init__()
        self.word = word
        self.phoneme = phon
        self.filename = filename
        self.voice = voice
        self.voice_lang = voice_lang
        self.rate = rate
        self.variant = operation

    def get_json(self):
        """Return a JSON-serializable dict summarizing this record."""
        return {
            'filename': self.filename,
            'voice': self.voice,
            'rate': self.rate,
            # The constructor stores ``operation`` as ``variant``; there is
            # no ``operation`` attribute to read.
            'operation': self.variant,
        }

    def get_csv(self):
        """Return this record as one newline-terminated CSV row.

        Column order: word, phoneme, voice, voice_lang, rate, variant,
        filename. Fields containing commas, quotes or newlines are quoted so
        that the row always parses back into seven columns.
        """
        # Imported locally so the method does not rely on module-level names.
        import csv
        import io

        cols = [self.word, self.phoneme, self.voice,
                self.voice_lang, self.rate, self.variant,
                self.filename]
        buf = io.StringIO()
        csv.writer(buf, lineterminator='\n').writerow([str(c) for c in cols])
        return buf.getvalue()
class SynthVariant(object):
    """One installed voice at one speaking rate.

    Holds two ``NSSpeechSynthesizer`` objects for the voice: ``synth`` and
    ``phone_synth``, the latter configured with ``NSSpeechModePhoneme`` as
    its input mode. Within this class only ``synth`` is used (to obtain
    phonemes); audio itself is rendered through ``cli_gen_audio``.
    """

    def __init__(self, identifier, voice, lang, rate):
        super(SynthVariant, self).__init__()
        self.synth = self._new_synth(identifier, rate)
        self.phone_synth = self._new_synth(identifier, rate)
        self.phone_synth.setObject_forProperty_error_(
            NSSpeechModePhoneme, NSSpeechInputModeProperty, None)
        self.identifier = identifier
        self.rate = rate
        self.name = voice
        self.lang = lang
        self.phoneme_capable = self.is_phoneme_capable()

    @staticmethod
    def _new_synth(identifier, rate):
        """Allocate a synthesizer for ``identifier`` with volume 100 and ``rate``."""
        synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
        synth.setVolume_(100)
        synth.setRate_(rate)
        return synth

    def __repr__(self):
        return 'Synthesizer[{} - {}]'.format(self.name, self.rate)

    def is_phoneme_capable(self):
        """Return True when the voice yields a non-empty phoneme string for 'water'."""
        return self.synth.phonemesFromText_('water') != ''

    def generate_audio(self, text, variant):
        """Render ``text`` to an audio file and return its ``SynthFile`` record.

        ``variant`` selects what is spoken and what is stored as the phoneme:

        * ``'low'``: speaks ``text``; stores the synthesizer's phonemes.
        * ``'medium'``: speaks the result of ``similar_phoneme_phrase`` on
          those phonemes, prefixed with ``[[inpt PHON]]``; stores that result.
        * ``'high'``: speaks and stores the result of ``similar_phrase(text)``.
        * anything else: speaks ``text`` and stores an empty string.
        """
        source_phonemes = self.synth.phonemesFromText_(text)
        recorded, spoken = '', text
        if variant == 'low':
            recorded = source_phonemes
        elif variant == 'medium':
            recorded = similar_phoneme_phrase(source_phonemes)
            spoken = '[[inpt PHON]] ' + recorded
        elif variant == 'high':
            recorded = spoken = similar_phrase(text)
        file_name = dest_filename(text, self.name, self.rate, variant)
        abs_path, rel_path = dest_path(self.name, self.rate, file_name)
        cli_gen_audio(spoken, self.rate, self.name, abs_path)
        return SynthFile(text, recorded, rel_path,
                         self.name, self.lang, self.rate, variant)

    def create_synth_dirs(self):
        """Create this voice/rate output directory if the voice is phoneme capable."""
        if not self.phoneme_capable:
            return
        create_dir(dest_dir + self.name + '/' + str(self.rate))

    @staticmethod
    def voices_for_lang(lang):
        """List ``(identifier, name, language)`` for installed voices of ``lang``.

        Voices whose ``VoiceGender`` attribute is ``'VoiceGenderNeuter'`` are
        left out.
        """
        all_attrs = [
            NSSpeechSynthesizer.attributesForVoice_(voice_id)
            for voice_id in NSSpeechSynthesizer.availableVoices()
        ]
        matches = []
        for attrs in all_attrs:
            wanted = (attrs['VoiceLanguage'] == lang
                      and attrs['VoiceGender'] != 'VoiceGenderNeuter')
            if wanted:
                matches.append((attrs['VoiceIdentifier'],
                                attrs['VoiceName'],
                                attrs['VoiceLanguage']))
        return matches

    @classmethod
    def synth_with(cls, voice_params, rate=180):
        """Build an instance from an ``(identifier, name, language)`` triple."""
        voice_id, voice_name, voice_lang = voice_params
        return cls(voice_id, voice_name, voice_lang, rate)
def synth_generator():
    """Set up synthesizers for the en-US voices and return a batch runner.

    One ``SynthVariant`` is built per voice and per rate in
    ``[150, 180, 210, 250]``; those that are not phoneme capable are
    discarded. The output root ``dest_dir`` is created as a side effect.

    Returns:
        ``synth_for_words(words, writer)``, which renders every word with
        every kept synthesizer in the 'low', 'medium' and 'high' variants and
        hands each resulting record to ``writer``.
    """
    voice_triples = SynthVariant.voices_for_lang('en-US')
    rates = [150, 180, 210, 250]
    usable = []
    create_dir(dest_dir)
    for triple in voice_triples:
        for rate in rates:
            candidate = SynthVariant.synth_with(triple, rate)
            if not candidate.phoneme_capable:
                print('Discarding phoneme incapable ', candidate)
                continue
            print('Adding ', candidate)
            usable.append(candidate)

    def synth_for_words(words, writer):
        started = time.time()
        heading = "Synthesizing {} words : ".format(len(words))
        for synth in usable:
            synth.create_synth_dirs()
            for variant in ('low', 'medium', 'high'):
                # A fresh progress bar per synthesizer/variant pass.
                update, prog = prog_bar(heading)
                for word in prog(words):
                    update('"{}" with {} variant ({})'.format(word, synth, variant))
                    writer(synth.generate_audio(word, variant))
        elapsed = hms_string(time.time() - started)
        print("It took {} to synthsize all variants.".format(elapsed))

    return synth_for_words
def write_synths(synth_list, fname, csv=False):
    """Write all records in ``synth_list`` to the file ``fname``.

    Args:
        synth_list: iterable of objects providing ``get_csv()`` and
            ``get_json()``.
        fname: path of the file to (over)write.
        csv: when true, write the concatenated ``get_csv()`` rows; otherwise
            write one JSON array of the ``get_json()`` dicts.
    """
    # The context manager closes the file even if a record fails to
    # serialize midway.
    with open(fname, 'w') as f:
        if csv:
            for s in synth_list:
                f.write(s.get_csv())
        else:
            json.dump([s.get_json() for s in synth_list], f)
def synth_logger(fname, csv=False):
    """Open ``fname`` for writing and return a ``(writer, closer)`` pair.

    Args:
        fname: path of the log file; it is opened (and truncated) immediately.
        csv: when true, ``writer(record)`` appends ``record.get_csv()`` to the
            file right away. Otherwise records are collected in memory and
            written as a single JSON array when ``closer()`` is called.

    Returns:
        ``(writer, closer)``. ``closer()`` must be called once to finish the
        file.
    """
    f = open(fname, 'w')
    synth_list = []

    def csv_writer(s):
        f.write(s.get_csv())

    def json_writer(s):
        synth_list.append(s)

    def close_file():
        try:
            if not csv:
                json.dump([s.get_json() for s in synth_list], f)
        finally:
            # Release the handle even when serializing the records fails.
            f.close()

    if csv:
        return csv_writer, close_file
    return json_writer, close_file
def generate_audio_for_text_list(text_list):
    """Synthesize every text in ``text_list`` and log the results to ``dest_file``.

    Each produced record is appended to the CSV manifest as it is generated.
    An error during synthesis is printed to stdout as a traceback and ends
    the run early; the manifest is closed in every case.
    """
    import sys
    import traceback

    # Build the synthesizers before opening the manifest: synth_generator()
    # creates ``dest_dir``, and with it the ``outputs`` directory that
    # ``dest_file`` lives in (both are rooted at the working directory), so
    # a first run no longer fails on a missing directory.
    synth_for_texts = synth_generator()
    (writer, closer) = synth_logger(dest_file, csv=True)
    try:
        synth_for_texts(text_list, writer)
    except Exception:
        # Best effort: report the failure but keep what was written so far.
        # KeyboardInterrupt/SystemExit are deliberately not caught here, so
        # the run can still be aborted.
        traceback.print_exc(file=sys.stdout)
    finally:
        closer()
def generate_audio_for_stories():
    """Synthesize every entry found in ``./inputs/all_stories.json``.

    The JSON file is read as a mapping whose values are lists; all list
    items are flattened into one list and passed to
    ``generate_audio_for_text_list``.
    """
    # Alternative input kept from the original source:
    # story_file = './inputs/all_stories_hs.json'
    story_file = './inputs/all_stories.json'
    # Close the input file as soon as it has been parsed.
    with open(story_file) as story_fh:
        stories_data = json.load(story_fh)
    text_list = [text for group in stories_data.values() for text in group]
    generate_audio_for_text_list(text_list)
if __name__ == '__main__':
    # Script entry point: synthesize two sample texts.
    # generate_audio_for_stories() is the alternative entry for the story corpus.
    sample_texts = ['I want to go home', 'education']
    generate_audio_for_text_list(sample_texts)