speech-scoring/tts_samplegen.py

172 lines
5.6 KiB
Python

import objc
from AppKit import NSSpeechSynthesizer, NSSpeechInputModeProperty
from AppKit import NSSpeechModePhoneme
from Foundation import NSURL
import json
import random
import os
import re
import subprocess
# Base name shared by the audio output directory and the CSV manifest.
OUTPUT_NAME = 'audio'
# Absolute directory (with trailing slash) under which audio files are written.
dest_dir = ''.join((os.path.abspath('.'), '/outputs/', OUTPUT_NAME, '/'))
# Manifest path, relative to the current working directory.
dest_file = ''.join(('./outputs/', OUTPUT_NAME, '.csv'))
def create_dir(direc):
    """Create the directory `direc` (and any missing parents) if absent.

    Args:
        direc: Path of the directory to create.

    Raises:
        OSError: If the directory cannot be created.
    """
    # os.makedirs rather than os.mkdir: callers pass nested paths such as
    # '<outputs>/<name>/<voice>/<rate>', whose parents may not exist yet.
    if not os.path.exists(direc):
        os.makedirs(direc)
def dest_filename(n, v, r, t):
    """Build an '.aiff' file name from four labels and a random suffix.

    The four arguments are joined with '-' in the order given, followed by
    another '-', a random integer in 0..10000 (inclusive) and '.aiff'.

    Returns:
        The file name as a string (no directory component).
    """
    stem = '{}-{}-{}-{}-'.format(n, v, r, t)
    suffix = str(random.randint(0, 10000))
    return stem + suffix + '.aiff'
def dest_path(v, r, n):
    """Return the output path '<dest_dir><v>/<r>/<n>'.

    Args:
        v: Voice directory name.
        r: Rate directory name; may be an int or a string.
        n: File name.

    Returns:
        The path string, rooted at the module-level `dest_dir`.
    """
    # The rate arrives as an int from SynthVariant.generate_audio; converting
    # it avoids a TypeError from concatenating str and int.
    return dest_dir + v + '/' + str(r) + '/' + n
def cli_gen_audio(speech_cmd, rate, voice, out_path):
    """Run the `say` command-line tool to write speech audio to `out_path`.

    Args:
        speech_cmd: Text (or embedded-command string) passed to `say`.
        rate: Value for the `-r` option; converted to a string.
        voice: Value for the `-v` option.
        out_path: Value for the `-o` option (output file).

    The exit status of `say` is not inspected.
    """
    command = ['say', '-v', voice, '-r', str(rate), '-o', out_path, speech_cmd]
    subprocess.call(command)
class SynthFile(object):
    """Metadata record describing one synthesized audio file.

    Attributes:
        word: The word that was synthesized.
        phoneme: Phoneme string associated with the word.
        filename: Name of the generated audio file.
        voice: Voice name used for synthesis.
        rate: Speech rate used for synthesis.
        variant: Variant label (the `operation` constructor argument).
    """

    def __init__(self, word, phon, filename, voice, rate, operation):
        super(SynthFile, self).__init__()
        self.word = word
        self.phoneme = phon
        self.filename = filename
        self.voice = voice
        self.rate = rate
        self.variant = operation

    def get_json(self):
        """Return a JSON-serializable dict describing this file."""
        return {
            'filename': self.filename,
            'voice': self.voice,
            'rate': self.rate,
            # The value is stored as `variant`; `self.operation` was never
            # assigned and raised AttributeError.
            'operation': self.variant,
        }

    def get_csv(self):
        """Return one newline-terminated CSV row for this file.

        Columns: word, phoneme, voice, rate, variant, filename. Values are
        not quoted or escaped.
        """
        # Six placeholders for six values: the format string previously had
        # five, which silently dropped the file name from every row.
        return '{},{},{},{},{},{}\n'.format(self.word, self.phoneme,
                                            self.voice, self.rate,
                                            self.variant, self.filename)
class SynthVariant(object):
    """One (voice, rate) configuration used to render words to audio files.

    Attributes:
        synth: NSSpeechSynthesizer for the voice; used to obtain phonemes.
        phone_synth: Second NSSpeechSynthesizer with its input mode property
            set to NSSpeechModePhoneme (not used by `generate_audio`).
        identifier: Voice identifier passed to the constructor.
        rate: Speech rate passed to the constructor.
        name: Last dot-separated component of `identifier`.
    """

    def __init__(self, identifier, rate):
        super(SynthVariant, self).__init__()
        # NOTE(review): NSSpeechSynthesizer documents volume as 0.0-1.0;
        # confirm that 100 is intended.
        self.synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
        self.synth.setVolume_(100)
        self.synth.setRate_(rate)
        self.phone_synth = NSSpeechSynthesizer.alloc().initWithVoice_(
            identifier)
        self.phone_synth.setVolume_(100)
        self.phone_synth.setRate_(rate)
        self.phone_synth.setObject_forProperty_error_(
            NSSpeechModePhoneme, NSSpeechInputModeProperty, None)
        self.identifier = identifier
        self.rate = rate
        self.name = identifier.split('.')[-1]

    def __repr__(self):
        # Three placeholders need three values; the identifier was missing,
        # which made repr() raise IndexError.
        return 'Synthesizer[{} - {}]({})'.format(self.name, self.rate,
                                                 self.identifier)

    def generate_audio(self, word, variant):
        """Render `word` with the `say` tool and describe the resulting file.

        Args:
            word: Text to synthesize.
            variant: 'low', 'medium' or 'high'. Any other value speaks the
                plain word and records an empty phoneme string.

        Returns:
            A SynthFile describing the generated audio file.
        """
        orig_phon = self.synth.phonemesFromText_(word)
        if variant == 'medium':
            # Strip every digit from the phoneme string (presumably stress
            # markers - confirm) and hand the phonemes to `say` behind the
            # '[[inpt PHON]]' embedded command.
            phoneme = re.sub('[0-9]', '', orig_phon)
            phon_cmd = '[[inpt PHON]] ' + phoneme
        elif variant in ('low', 'high'):
            # NOTE(review): 'low' and 'high' currently produce identical
            # output (plain word, unmodified phonemes) - confirm intent.
            phoneme = orig_phon
            phon_cmd = word
        else:
            phoneme = ''
            phon_cmd = word
        fname = dest_filename(word, phoneme, self.name, self.rate)
        # dest_path concatenates strings, so the int rate is converted first.
        d_path = dest_path(self.name, str(self.rate), fname)
        cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
        return SynthFile(word, phoneme, fname, self.name, self.rate, variant)
def synth_generator():
    """Set up one SynthVariant per installed en-US voice and speech rate.

    Selects the installed voices whose 'VoiceLanguage' attribute is 'en-US'
    and whose identifier's last dot-separated component starts with an
    uppercase letter, pairs each with the rates 150, 180, 210 and 250, and
    creates the output directory '<dest_dir><voice name>/<rate>' for every
    pair.

    Returns:
        A function `synth_for_words(words)` that renders every word with
        every synthesizer in the 'low', 'medium' and 'high' variants and
        returns the list of resulting SynthFile objects.
    """
    voices_installed = NSSpeechSynthesizer.availableVoices()
    voice_attrs = [
        NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed
    ]
    us_voices_ids = [
        v['VoiceIdentifier'] for v in voice_attrs
        if v['VoiceLanguage'] == 'en-US'
        and v['VoiceIdentifier'].split('.')[-1][0].isupper()
    ]
    voice_rates = [150, 180, 210, 250]
    voice_synths = []
    create_dir(dest_dir)
    for v in us_voices_ids:
        for r in voice_rates:
            synth = SynthVariant(v, r)
            # Audio is written under the short voice name (synth.name), not
            # the full identifier, so the directories must use it too. The
            # rate is an int and has to be converted before concatenation.
            # Each level is created separately so this works even when
            # create_dir cannot make intermediate directories.
            voice_dir = dest_dir + synth.name
            create_dir(voice_dir)
            create_dir(voice_dir + '/' + str(synth.rate))
            voice_synths.append(synth)

    def synth_for_words(words):
        """Render each word with every synthesizer and variant."""
        all_synths = []
        for w in words:
            for s in voice_synths:
                for variant in ('low', 'medium', 'high'):
                    all_synths.append(s.generate_audio(w, variant))
        return all_synths

    return synth_for_words
def write_synths(synth_list, fname, csv=False):
    """Write the records in `synth_list` to the file `fname`.

    Args:
        synth_list: Iterable of objects providing `get_csv()` and
            `get_json()` (e.g. SynthFile instances).
        fname: Path of the file to (over)write.
        csv: When true, write one `get_csv()` row per record; otherwise
            write a single JSON array of the `get_json()` dicts.
    """
    # The context manager closes the file even if serialization raises.
    with open(fname, 'w') as f:
        if csv:
            f.writelines(s.get_csv() for s in synth_list)
        else:
            json.dump([s.get_json() for s in synth_list], f)
def generate_audio_for_stories():
    """Generate audio for the words listed in './inputs/all_stories_hs.json'.

    The JSON file is loaded as a mapping; the first element of every entry
    in every value is collected as a word (presumably each value is a list
    of token records whose first field is the word - verify against the
    input file).

    Returns:
        The list produced by the `synth_generator()` renderer for those
        words.
    """
    # Context manager so the input file is closed after parsing.
    with open('./inputs/all_stories_hs.json') as stories_file:
        stories_data = json.load(stories_file)
    word_list = [t[0] for i in stories_data.values() for t in i]
    words_audio_synth = synth_generator()
    return words_audio_synth(word_list)
# words_audio_synth = synth_generator()
# synth = NSSpeechSynthesizer.alloc().init()
# voices_installed = NSSpeechSynthesizer.availableVoices()
# voice_attrs = [NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed]
# us_voices_ids = [v['VoiceIdentifier'] for v in voice_attrs if v['VoiceLanguage'] == 'en-US' and v['VoiceIdentifier'].split('.')[-1][0].isupper()]
# synth.setVoice_(us_voices_ids[2])
# synth.startSpeakingString_('your')
# fname = dest_filename(word,self.name,self.rate,self.operation)
# d_path = dest_path(fname)
# d_url = dest_url(d_path)
# Script body: render every voice/rate/variant sample for the single word
# OUTPUT_NAME.
synths = synth_generator()([OUTPUT_NAME])
# synths = generate_audio_for_stories()
# Record the generated samples as CSV rows in dest_file.
write_synths(synths, dest_file, csv=True)
# write_synths(synths,'./outputs/synths.json')