speech-scoring/speech_samplegen.py

253 lines
8.6 KiB
Python
Raw Normal View History

2017-10-04 12:21:24 +00:00
import objc
2017-10-25 08:06:41 +00:00
from AppKit import NSSpeechSynthesizer, NSSpeechInputModeProperty
from AppKit import NSSpeechModePhoneme
from Foundation import NSURL
2017-10-04 12:21:24 +00:00
import json
import csv
import random
2017-10-04 12:21:24 +00:00
import os
import re
import subprocess
import time
2017-11-14 17:26:13 +00:00
from tqdm import tqdm
2017-10-04 12:21:24 +00:00
from generate_similar import similar_phoneme_phrase,similar_phrase
2017-12-28 14:31:44 +00:00
from speech_tools import hms_string,create_dir,format_filename,reservoir_sample
2017-10-26 12:36:14 +00:00
2017-12-28 14:31:44 +00:00
# Name of the current generation run; it names both the audio directory and
# the CSV index written next to it.
OUTPUT_NAME = 'test_5_words'
# Absolute directory (with trailing slash) that receives the audio files.
dest_dir = '{}/outputs/{}/'.format(os.path.abspath('.'), OUTPUT_NAME)
# Index file, relative to the current working directory.
dest_file = './outputs/{}.csv'.format(OUTPUT_NAME)
2017-10-26 09:57:22 +00:00
def dest_filename(w, v, r, t):
    """Return the file name for one synthesized sample.

    The name is ``<w>-<v>-<r>-<t>-<random int>.aiff`` passed through
    ``format_filename``. The random integer is drawn from 0..10000
    inclusive, so names are not guaranteed to be unique.

    Args:
        w: text being synthesized.
        v: voice name.
        r: speaking rate.
        t: variant label.
    """
    suffix = str(random.randint(0, 10000))
    raw_name = '{}-{}-{}-{}-{}.aiff'.format(w, v, r, t, suffix)
    return format_filename(raw_name)
2017-10-25 08:06:41 +00:00
def dest_path(v, r, n):
    """Return ``(absolute_path, relative_path)`` for a sample file.

    The relative path is ``<v>/<r>/<n>``; the absolute path is the same
    string prefixed with the module-level ``dest_dir``.

    Args:
        v: voice name (string).
        r: speaking rate; converted with ``str``.
        n: file name (string).
    """
    relative = '/'.join((v, str(r), n))
    absolute = dest_dir + relative
    return absolute, relative
2017-10-25 08:06:41 +00:00
def cli_gen_audio(speech_cmd, rate, voice, out_path):
    """Render ``speech_cmd`` to ``out_path`` with the ``say`` command.

    Args:
        speech_cmd: text (or embedded speech command) to speak.
        rate: speaking rate passed to ``-r``.
        voice: voice name passed to ``-v``.
        out_path: output audio file passed to ``-o``.

    The exit status of ``say`` is not inspected or returned.
    """
    # The command is run from an argument list (no shell), so the single
    # quotes wrapped around the text reach `say` verbatim.
    # NOTE(review): confirm the literal quotes are intentional.
    quoted_text = "'" + speech_cmd + "'"
    argv = ['say', '-v', voice, '-r', str(rate), '-o', out_path, quoted_text]
    subprocess.call(argv)
2017-10-25 08:06:41 +00:00
class SynthFile(object):
    """Metadata record describing one synthesized audio file.

    Attributes:
        word: the text that was requested.
        phoneme: the phoneme/phrase string associated with the sample.
        filename: path of the audio file as given by the caller.
        voice: voice name.
        voice_lang: language of the voice.
        rate: speaking rate.
        variant: variant label (the constructor's ``operation`` argument).
    """

    def __init__(self, word, phon, filename, voice, voice_lang, rate, operation):
        super(SynthFile, self).__init__()
        self.word = word
        self.phoneme = phon
        self.filename = filename
        self.voice = voice
        self.voice_lang = voice_lang
        self.rate = rate
        # Stored under the name ``variant``; there is no ``operation`` attribute.
        self.variant = operation

    def _columns(self):
        """Return the record's fields in CSV column order."""
        return [self.word, self.phoneme, self.voice,
                self.voice_lang, self.rate, self.variant,
                self.filename]

    def get_json(self):
        """Return a JSON-serializable summary of the record.

        The ``'operation'`` key carries the variant label. It is read from
        ``self.variant`` because that is where ``__init__`` stores it.
        """
        return {
            'filename': self.filename,
            'voice': self.voice,
            'rate': self.rate,
            'operation': self.variant
        }

    def get_csv(self):
        """Return the record as one comma-joined line ending in a newline.

        Values are joined as-is, with no quoting or escaping; use
        ``get_values`` with ``csv.writer`` when fields may contain commas.
        """
        return ','.join(self.get_values()) + '\n'

    def get_values(self):
        """Return the record's fields as a list of strings in column order."""
        return [str(c) for c in self._columns()]
class SynthVariant(object):
    """A speech synthesizer bound to one voice and one speaking rate.

    Attributes:
        synth: ``NSSpeechSynthesizer`` used to obtain phonemes for text.
        phone_synth: second ``NSSpeechSynthesizer`` with its input mode set
            to phoneme mode; no method of this class uses it.
        identifier: voice identifier passed to the synthesizers.
        name: voice name, also used for the ``say`` command and directories.
        lang: language of the voice.
        rate: speaking rate.
        phoneme_capable: result of ``is_phoneme_capable()`` at construction.
    """

    def __init__(self, identifier, voice, lang, rate):
        super(SynthVariant, self).__init__()
        self.synth = self._new_synth(identifier, rate)
        self.phone_synth = self._new_synth(identifier, rate)
        self.phone_synth.setObject_forProperty_error_(
            NSSpeechModePhoneme, NSSpeechInputModeProperty, None)
        self.identifier = identifier
        self.rate = rate
        self.name = voice
        self.lang = lang
        self.phoneme_capable = self.is_phoneme_capable()

    @staticmethod
    def _new_synth(identifier, rate):
        """Create a synthesizer for ``identifier`` with volume 100 and ``rate``."""
        engine = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
        engine.setVolume_(100)
        engine.setRate_(rate)
        return engine

    def __repr__(self):
        return 'Synthesizer[{} - {}]'.format(self.name, self.rate)

    def is_phoneme_capable(self):
        """Return True when the voice yields a non-empty phoneme string for 'water'."""
        return self.synth.phonemesFromText_('water') != ''

    def generate_audio(self, text, variant):
        """Synthesize ``text`` as the given variant and describe the result.

        Variants:
            'low':    speak ``text`` unchanged; record its phonemes.
            'medium': speak ``similar_phoneme_phrase`` of the phonemes,
                      prefixed with ``[[inpt PHON]] ``.
            'high':   speak ``similar_phrase(text)`` as plain text.
            other:    speak ``text`` unchanged; record an empty phoneme.

        Returns:
            A ``SynthFile`` whose filename is the path relative to the
            output directory.
        """
        orig_phon = self.synth.phonemesFromText_(text)
        if variant == 'low':
            phoneme, phon_cmd = orig_phon, text
        elif variant == 'medium':
            phoneme = similar_phoneme_phrase(orig_phon)
            phon_cmd = '[[inpt PHON]] ' + phoneme
        elif variant == 'high':
            phoneme = similar_phrase(text)
            phon_cmd = phoneme
        else:
            phoneme, phon_cmd = '', text
        fname = dest_filename(text, self.name, self.rate, variant)
        d_path, r_path = dest_path(self.name, self.rate, fname)
        cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
        return SynthFile(text, phoneme, r_path, self.name, self.lang,
                         self.rate, variant)

    def create_synth_dirs(self):
        """Create ``<dest_dir>/<voice>/<rate>`` if the voice is phoneme capable."""
        if not self.phoneme_capable:
            return
        create_dir(dest_dir + self.name + '/' + str(self.rate))

    @staticmethod
    def voices_for_lang(lang):
        """List installed voices for ``lang`` whose gender is not neuter.

        Returns:
            A list of ``(identifier, name, language)`` tuples.
        """
        matches = []
        for voice_id in NSSpeechSynthesizer.availableVoices():
            attrs = NSSpeechSynthesizer.attributesForVoice_(voice_id)
            if (attrs['VoiceLanguage'] == lang
                    and attrs['VoiceGender'] != 'VoiceGenderNeuter'):
                matches.append((attrs['VoiceIdentifier'],
                                attrs['VoiceName'],
                                attrs['VoiceLanguage']))
        return matches

    @classmethod
    def synth_with(cls, voice_params, rate=180):
        """Build an instance from an ``(identifier, name, language)`` tuple."""
        voice_id, voice_name, voice_lang = voice_params
        return cls(voice_id, voice_name, voice_lang, rate)
def synth_generator():
    """Prepare synthesizers and return a function that renders texts.

    One ``SynthVariant`` is built for every en-US voice at each of the
    rates 150, 180 and 210; those that are not phoneme capable are
    discarded. The output directory ``dest_dir`` is created as a side
    effect.

    Returns:
        ``synth_for_words(words, writer)``: for every kept synthesizer and
        each of the variants 'low', 'medium' and 'high', it synthesizes
        every item of ``words`` and passes the resulting record to
        ``writer``.
    """
    us_voices_ids = SynthVariant.voices_for_lang('en-US')
    voice_rates = [150, 180, 210]  # , 250]
    voice_synths = []
    create_dir(dest_dir)
    for vp in us_voices_ids:
        for r in voice_rates:
            s = SynthVariant.synth_with(vp, r)
            if s.phoneme_capable:
                print('Adding ', s)
                voice_synths.append(s)
            else:
                print('Discarding phoneme incapable ', s)

    def synth_for_words(words, writer):
        start_time = time.time()
        for s in voice_synths:
            s.create_synth_dirs()
            for v in ['low', 'medium', 'high']:
                prog = tqdm(words)
                prog.set_postfix(variant=v, voice=s.name, rate=s.rate)
                try:
                    # Iterate the bar that carries the postfix/description,
                    # so it actually advances and only one bar is drawn.
                    for w in prog:
                        prog.set_description(
                            'Synthesizing text:"{}"'.format(w))
                        synthed = s.generate_audio(w, v)
                        writer(synthed)
                finally:
                    # Close the bar even when synthesis fails part-way.
                    prog.close()
        end_time = time.time()
        time_str = hms_string(end_time - start_time)
        print("It took {} to synthsize all variants.".format(time_str))

    return synth_for_words
2017-11-14 17:26:13 +00:00
def synth_logger(fname, csv_mode=False):
    """Open ``fname`` for writing and return ``(writer, close_file)``.

    Args:
        fname: path of the log file; it is opened (and truncated) at once.
        csv_mode: when true, ``writer(s)`` appends ``s.get_values()`` as a
            CSV row immediately. Otherwise ``writer(s)`` only collects
            ``s``, and ``close_file()`` dumps ``s.get_json()`` of every
            collected record as one JSON list.

    Returns:
        A ``(writer, close_file)`` pair of callables; ``close_file`` must
        be called to close the file (and, in JSON mode, to write anything).
    """
    out_file = open(fname, 'w')
    row_writer = csv.writer(out_file, quoting=csv.QUOTE_MINIMAL)
    collected = []

    def csv_writer(s):
        row_writer.writerow(s.get_values())

    def json_writer(s):
        collected.append(s)

    def close_file():
        # JSON output is written in one go, just before closing.
        if not csv_mode:
            json.dump([rec.get_json() for rec in collected], out_file)
        out_file.close()

    chosen_writer = csv_writer if csv_mode else json_writer
    return chosen_writer, close_file
2017-11-02 07:44:08 +00:00
def generate_audio_for_text_list(text_list):
    """Synthesize every variant of each text and log the results as CSV.

    Records are written to the module-level ``dest_file``. Synthesis is
    best effort: an error is printed as a traceback to stdout instead of
    being raised, and the log file is always closed so rows written
    before the failure are kept.

    Args:
        text_list: texts to synthesize.
    """
    import sys
    import traceback

    (writer, closer) = synth_logger(dest_file, csv_mode=True)
    try:
        synth_for_texts = synth_generator()
        synth_for_texts(text_list, writer)
    except Exception:
        # Report and carry on; KeyboardInterrupt/SystemExit are not
        # swallowed, but still go through the ``finally`` below.
        traceback.print_exc(file=sys.stdout)
    finally:
        # Close the log even if building the synthesizers failed.
        closer()
2017-10-26 10:28:25 +00:00
2017-10-04 12:21:24 +00:00
def generate_audio_for_stories():
    '''
    Generates the audio sample variants for the list of words in the stories.

    Reads ``./inputs/all_stories.json``, flattens the lists stored under
    its values, removes duplicates, sorts the texts and synthesizes them.
    '''
    story_file = './inputs/all_stories.json'
    # Use a context manager so the file handle is closed after loading.
    with open(story_file) as story_fp:
        stories_data = json.load(story_fp)
    unique_texts = {t for i in stories_data.values() for t in i}
    text_list = sorted(unique_texts)
    generate_audio_for_text_list(text_list)
2017-10-04 12:21:24 +00:00
2017-12-28 14:31:44 +00:00
def generate_test_audio_for_stories(sample_count=0):
    '''
    Picks a list of words from the wordlist that are not in story words
    and generates the variants.

    Story words are the first element of every entry in
    ``./inputs/all_stories_hs.json``. Candidate words come from
    ``./inputs/wordlist.txt`` (one per line, stripped of newlines and
    underscores at both ends) and must be longer than 4 characters.

    Args:
        sample_count: when greater than 0, the candidates are reduced with
            ``reservoir_sample(candidates, sample_count)``; otherwise all
            candidates are used.
    '''
    story_file = './inputs/all_stories_hs.json'
    # Context managers close both input files once they have been read.
    with open(story_file) as story_fp:
        stories_data = json.load(story_fp)
    text_set = {t[0] for i in stories_data.values() for t in i}
    with open('./inputs/wordlist.txt', 'r') as wordlist_fp:
        word_list = [line.strip('\n_') for line in wordlist_fp]
    new_word_list = [w for w in word_list
                     if w not in text_set and len(w) > 4]
    if sample_count > 0:
        test_words = reservoir_sample(new_word_list, sample_count)
    else:
        test_words = new_word_list
    generate_audio_for_text_list(test_words)
if __name__ == '__main__':
    # Script entry point: synthesize a 5-word test sample.
    # Alternative entry points are generate_audio_for_text_list([...]) for
    # an explicit list of texts and generate_audio_for_stories() for all
    # story words.
    generate_test_audio_for_stories(sample_count=5)