speech-scoring/tts_samplegen.py

206 lines
7.0 KiB
Python
Raw Normal View History

2017-10-04 12:21:24 +00:00
import csv
import io
import json
import os
import random
import re
import subprocess

import objc
import progressbar
from AppKit import NSSpeechSynthesizer, NSSpeechInputModeProperty
from AppKit import NSSpeechModePhoneme
from Foundation import NSURL
2017-10-04 12:21:24 +00:00
2017-10-26 09:57:22 +00:00
# Base name shared by the output audio directory and the CSV manifest.
OUTPUT_NAME = 'story_sents'
# Absolute directory (with trailing slash) under which audio files are written.
dest_dir = ''.join([os.path.abspath('.'), '/outputs/', OUTPUT_NAME, '/'])
# Relative path of the manifest file describing every generated sample.
dest_file = ''.join(['./outputs/', OUTPUT_NAME, '.csv'])
2017-10-26 09:57:22 +00:00
def prog_bar(title):
    """Build a progress bar whose leading label can be refreshed.

    Args:
        title: Text shown in the label at the start of the bar.

    Returns:
        A ``(update_prog, prog)`` tuple. ``prog`` is the
        ``progressbar.ProgressBar`` instance; ``update_prog(current)`` swaps
        the first widget for a label reading ``"<title> : <current>"`` and
        then calls ``prog.update()``.
    """
    label = progressbar.FormatLabel(title)
    bar = progressbar.Bar()
    eta = progressbar.ETA()
    widgets = [label, ' [', bar, '] - ', eta]
    prog = progressbar.ProgressBar(widgets=widgets)

    def update_prog(current):
        # Replace the label in the same list object that was handed to the bar.
        text = '{} : {}'.format(title, current)
        widgets[0] = progressbar.FormatLabel(text)
        prog.update()

    return (update_prog, prog)
def create_dir(direc):
    """Create directory ``direc`` (and any missing parents) if needed.

    Calling this for a directory that already exists is a no-op.

    Args:
        direc: Path of the directory to create.

    Raises:
        OSError: If the directory cannot be created and does not already
            exist as a directory (for example, a regular file occupies the
            path).
    """
    try:
        os.makedirs(direc)
    except OSError:
        # Attempt first, inspect afterwards: avoids the window between an
        # "exists?" check and the creation in which another process could
        # create the directory. Anything other than "already a directory"
        # is a real failure and is propagated.
        if not os.path.isdir(direc):
            raise
2017-10-04 12:21:24 +00:00
2017-10-25 08:06:41 +00:00
2017-10-26 09:57:22 +00:00
def dest_filename(w, v, r, t):
    """Return an ``.aiff`` file name built from the sample's attributes.

    The name has the form ``<w>-<v>-<r>-<t>-<n>.aiff`` where ``n`` is a
    random integer between 0 and 10000 inclusive, so repeated calls with the
    same arguments usually (not always) give different names.

    Args:
        w: First name component (callers in this file pass the text).
        v: Second name component (callers in this file pass the voice name).
        r: Third name component (callers in this file pass the rate).
        t: Fourth name component (callers in this file pass the variant).
    """
    suffix = str(random.randint(0, 10000))
    template = '{}-{}-{}-{}-{}.aiff'
    return template.format(w, v, r, t, suffix)
2017-10-25 08:06:41 +00:00
def dest_path(v, r, n):
    """Return the full output path ``<dest_dir><v>/<r>/<n>``.

    Args:
        v: Directory component directly under ``dest_dir`` (a string).
        r: Next directory component; converted with ``str``.
        n: File name (a string).
    """
    pieces = (dest_dir, v, '/', str(r), '/', n)
    return ''.join(pieces)
2017-10-25 08:06:41 +00:00
def cli_gen_audio(speech_cmd, rate, voice, out_path):
    """Run the ``say`` command-line tool to write speech to ``out_path``.

    The command's exit status is not inspected.

    Args:
        speech_cmd: Final argument handed to ``say`` (the text to speak).
        rate: Value passed to ``-r``; converted with ``str``.
        voice: Value passed to ``-v``.
        out_path: Value passed to ``-o``.
    """
    argv = ['say', '-v', voice, '-r', str(rate), '-o', out_path, speech_cmd]
    subprocess.call(argv)
class SynthFile(object):
    """Metadata record describing one synthesized audio file.

    Attributes are plain values supplied by the caller:
    ``word``, ``phoneme``, ``filename``, ``voice``, ``rate`` and ``variant``.
    """

    def __init__(self, word, phon, filename, voice, rate, operation):
        """Store the sample's attributes.

        Args:
            word: Text that was synthesized.
            phon: Phoneme string associated with the text.
            filename: Name of the generated audio file.
            voice: Name of the voice used.
            rate: Speaking rate used.
            operation: Variant label; stored as ``self.variant``.
        """
        super(SynthFile, self).__init__()
        self.word = word
        self.phoneme = phon
        self.filename = filename
        self.voice = voice
        self.rate = rate
        # The parameter is named ``operation`` but the attribute is ``variant``.
        self.variant = operation

    def get_json(self):
        """Return a JSON-serializable dict describing this sample.

        The variant is exposed under the ``'operation'`` key.
        """
        return {
            'filename': self.filename,
            'voice': self.voice,
            'rate': self.rate,
            # Read from ``variant``: no ``operation`` attribute is ever set.
            'operation': self.variant,
        }

    def get_csv(self):
        """Return one newline-terminated CSV row for this sample.

        Column order: word, phoneme, voice, rate, variant, filename.
        Fields containing commas, quotes or line breaks are quoted, so
        multi-word text and phoneme strings with punctuation stay in a
        single column.
        """
        row = [self.word, self.phoneme, self.voice,
               self.rate, self.variant, self.filename]
        buf = io.StringIO()
        # lineterminator='\n' keeps the row ending the callers already expect.
        csv.writer(buf, lineterminator='\n').writerow(row)
        return buf.getvalue()
class SynthVariant(object):
    """A single voice at a single speaking rate.

    Holds two ``NSSpeechSynthesizer`` instances for the voice: ``synth`` and
    ``phone_synth`` (the latter switched to phoneme input mode). Audio files
    themselves are produced through ``cli_gen_audio``; within this class
    ``synth`` is used only to obtain phonemes, and ``phone_synth`` is not
    used by ``generate_audio``.
    """

    def __init__(self, identifier, voice, lang, rate):
        """Set up both synthesizers and the output directory.

        Args:
            identifier: Voice identifier passed to ``initWithVoice_``.
            voice: Voice name; stored as ``self.name`` and used in paths.
            lang: Language tag of the voice; stored as ``self.lang``.
            rate: Speaking rate applied to both synthesizers.
        """
        super(SynthVariant, self).__init__()
        plain = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
        phonemic = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
        for engine in (plain, phonemic):
            engine.setVolume_(100)
            engine.setRate_(rate)
        # Only the second synthesizer is put into phoneme input mode.
        phonemic.setObject_forProperty_error_(
            NSSpeechModePhoneme, NSSpeechInputModeProperty, None)
        self.synth = plain
        self.phone_synth = phonemic
        self.identifier = identifier
        self.rate = rate
        self.name = voice
        self.lang = lang
        # Output lands in <dest_dir><voice>/<rate>/.
        create_dir(''.join([dest_dir, self.name, '/', str(self.rate)]))

    def __repr__(self):
        template = 'Synthesizer[{} - {}]'
        return template.format(self.name, self.rate)

    def generate_audio(self, word, variant):
        """Synthesize ``word`` to a file and return its ``SynthFile`` record.

        Args:
            word: Text to speak.
            variant: ``'low'`` and ``'high'`` speak the text as given and
                record the synthesizer's phonemes unchanged. ``'medium'``
                removes every digit from the phoneme string and speaks that
                string prefixed with ``'[[inpt PHON]] '``. Any other value
                speaks the text as given and records an empty phoneme string.

        Returns:
            A ``SynthFile`` for the generated file.
        """
        orig_phon = self.synth.phonemesFromText_(word)
        spoken = word
        if variant in ('low', 'high'):
            phoneme = orig_phon
        elif variant == 'medium':
            # Presumably the digits are stress markers - confirm against the
            # phoneme notation of the speech synthesizer.
            phoneme = re.sub('[0-9]', '', orig_phon)
            spoken = '[[inpt PHON]] ' + phoneme
        else:
            phoneme = ''
        fname = dest_filename(word, self.name, self.rate, variant)
        out_path = dest_path(self.name, self.rate, fname)
        cli_gen_audio(spoken, self.rate, self.name, out_path)
        return SynthFile(word, phoneme, fname, self.name, self.rate, variant)
def synth_generator():
    """Create a synthesizer per voice/rate pair and return a batch function.

    Every installed voice whose ``VoiceLanguage`` is ``'en-US'`` and whose
    ``VoiceGender`` is not ``'VoiceGenderNeuter'`` is combined with each of
    the rates 150, 180, 210 and 250. The output directory ``dest_dir`` is
    created before the synthesizers are built.

    Returns:
        ``synth_for_words(words)``: a function that renders every item of
        ``words`` with every synthesizer in the ``'low'``, ``'medium'`` and
        ``'high'`` variants and returns the list of ``generate_audio``
        results.
    """
    installed = NSSpeechSynthesizer.availableVoices()
    attrs_per_voice = [
        NSSpeechSynthesizer.attributesForVoice_(voice_id)
        for voice_id in installed
    ]

    us_voices_ids = []
    for attrs in attrs_per_voice:
        if attrs['VoiceLanguage'] != 'en-US':
            continue
        if attrs['VoiceGender'] == 'VoiceGenderNeuter':
            continue
        us_voices_ids.append((attrs['VoiceIdentifier'],
                              attrs['VoiceName'],
                              attrs['VoiceLanguage']))

    voice_rates = [150, 180, 210, 250]
    voice_synths = []
    create_dir(dest_dir)
    for (identifier, name, lang) in us_voices_ids:
        for rate in voice_rates:
            variant_synth = SynthVariant(identifier, name, lang, rate)
            print('Created ', variant_synth)
            voice_synths.append(variant_synth)

    def synth_for_words(words):
        all_synths = []
        prog_title = "Synthesizing {} words, current word".format(len(words))
        (update, prog) = prog_bar(prog_title)
        # Iterating through the progress bar advances it once per word.
        for word in prog(words):
            for variant_synth in voice_synths:
                for variant in ['low', 'medium', 'high']:
                    update('"{}" with {} variant ({})'.format(
                        word, variant_synth, variant))
                    all_synths.append(
                        variant_synth.generate_audio(word, variant))
        return all_synths

    return synth_for_words
2017-10-25 08:06:41 +00:00
def write_synths(synth_list, fname, csv=False):
    """Write the records in ``synth_list`` to ``fname``.

    The file is closed even if serializing a record raises.

    Args:
        synth_list: Iterable of objects providing ``get_csv()`` and
            ``get_json()``.
        fname: Path of the file to (over)write.
        csv: When true, write the concatenated ``get_csv()`` rows; otherwise
            write a JSON array of the ``get_json()`` dicts.
    """
    with open(fname, 'w') as out:
        if csv:
            out.writelines(s.get_csv() for s in synth_list)
        else:
            json.dump([s.get_json() for s in synth_list], out)
2017-10-04 12:21:24 +00:00
2017-10-25 08:06:41 +00:00
2017-10-04 12:21:24 +00:00
def generate_audio_for_stories(story_file='./inputs/all_stories.json'):
    """Synthesize audio for every item listed in a stories JSON file.

    Args:
        story_file: Path to a JSON file holding an object whose values are
            lists; every element of every list is synthesized. Defaults to
            ``'./inputs/all_stories.json'``.

    Returns:
        The list returned by the function obtained from
        ``synth_generator()`` when applied to the flattened item list.
    """
    # Load the input first (and close it promptly) so a missing or malformed
    # file fails before any synthesizers are created.
    with open(story_file) as story_fp:
        stories_data = json.load(story_fp)
    word_list = [item for group in stories_data.values() for item in group]
    words_audio_synth = synth_generator()
    return words_audio_synth(word_list)
2017-10-04 12:21:24 +00:00
2017-10-05 11:24:41 +00:00
# words_audio_synth = synth_generator()
# synth = NSSpeechSynthesizer.alloc().init()
# voices_installed = NSSpeechSynthesizer.availableVoices()
# voice_attrs = [NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed]
# us_voices_ids = [v['VoiceIdentifier'] for v in voice_attrs if v['VoiceLanguage'] == 'en-US' and v['VoiceIdentifier'].split('.')[-1][0].isupper()]
# synth.setVoice_(us_voices_ids[2])
# synth.startSpeakingString_('your')
# fname = dest_filename(word,self.name,self.rate,self.operation)
# d_path = dest_path(fname)
# d_url = dest_url(d_path)
2017-10-26 09:57:22 +00:00
# synths = synth_generator()([OUTPUT_NAME])
# Script body: synthesize audio for the stories input, then write the
# manifest of generated samples to ``dest_file`` in CSV form.
synths = generate_audio_for_stories()
write_synths(synths, dest_file, csv=True)
# write_synths(synths,'./outputs/synths.json')