Compare commits

...

5 Commits

Author SHA1 Message Date
Malar Kannan eb3ce8b7e5 wip high variant phoneme 2017-10-26 18:06:14 +05:30
Malar Kannan e57576d6fa discarding phoneme incapable synthesizers 2017-10-26 16:51:32 +05:30
Malar Kannan a953fa3355 fixed progress 2017-10-26 16:18:17 +05:30
Malar Kannan 7a520b79f4 writing to csv proactively 2017-10-26 15:58:25 +05:30
Malar Kannan 05f36daf7e refactored sample generation code 2017-10-26 15:27:22 +05:30
3 changed files with 121 additions and 47 deletions

2
.gitignore vendored
View File

@ -138,7 +138,7 @@ Temporary Items
# End of https://www.gitignore.io/api/macos # End of https://www.gitignore.io/api/macos
outputs/* outputs/*
inputs/mnist inputs/*
inputs/audio* inputs/audio*
logs/* logs/*
models/* models/*

View File

@ -2,3 +2,10 @@
1. create spectrograms of 150ms windows with 50ms overlap for each word. 1. create spectrograms of 150ms windows with 50ms overlap for each word.
2. train a rnn to output a vector using the spectrograms 2. train a rnn to output a vector using the spectrograms
3. train a nn to output True/False based on the acceptability of the rnn output. -> Siamese network(implementation detail) 3. train a nn to output True/False based on the acceptability of the rnn output. -> Siamese network(implementation detail)
4. validate with real world samples
The same word spoken by multiple people, etc. will have a low distance. Two words which are very different (you can use the similarity measure given in the speech_recognition repo) will have a high distance.
A word with the wrong pronunciation will have a medium distance from the one with the right pronunciation.
I also had good experience getting non-English voices to speak the English words to get "wrong" pronunciations - so those will be subtly different too.

View File

@ -7,24 +7,38 @@ import random
import os import os
import re import re
import subprocess import subprocess
import progressbar
OUTPUT_NAME = 'audio' from generate_similar import similar_phonemes
OUTPUT_NAME = 'story_sents'
dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/' dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
dest_file = './outputs/' + OUTPUT_NAME + '.csv' dest_file = './outputs/' + OUTPUT_NAME + '.csv'
def prog_bar(title):
    """Build a progress bar and a helper that refreshes its status label.

    Args:
        title: Leading text placed first in the bar's widget list.

    Returns:
        A ``(update_prog, bar)`` tuple: ``update_prog(current)`` swaps the
        label widget for one showing ``current`` and redraws the bar;
        ``bar`` is the ``progressbar.ProgressBar`` instance itself.
    """
    # Position of the FormatLabel placeholder inside the widget list below.
    label_slot = 3
    bar_widgets = [
        title,
        progressbar.Counter(),
        'th entry - ',
        progressbar.FormatLabel(''),
        ' [',
        progressbar.Bar(),
        '] - ',
        progressbar.ETA(),
    ]
    bar = progressbar.ProgressBar(widgets=bar_widgets)

    def update_prog(current):
        # NOTE(review): relies on ProgressBar holding a reference to this
        # same list object - confirm against the installed progressbar version.
        bar_widgets[label_slot] = progressbar.FormatLabel(current)
        bar.update()

    return (update_prog, bar)
def create_dir(direc): def create_dir(direc):
if not os.path.exists(direc): if not os.path.exists(direc):
os.mkdir(direc) os.makedirs(direc)
def dest_filename(n, v, r, t): def dest_filename(w, v, r, t):
return '{}-{}-{}-{}-'.format(n, v, r, return '{}-{}-{}-{}-{}.aiff'.format(w, v, r, t, str(random.randint(0, 10000)))
t) + str(random.randint(0, 10000)) + '.aiff'
def dest_path(v, r, n): def dest_path(v, r, n):
return dest_dir + v + '/' + r + '/' + n rel = v + '/' + str(r) + '/' + n
return (dest_dir + rel), rel
def cli_gen_audio(speech_cmd, rate, voice, out_path): def cli_gen_audio(speech_cmd, rate, voice, out_path):
@ -36,12 +50,13 @@ def cli_gen_audio(speech_cmd, rate, voice, out_path):
class SynthFile(object): class SynthFile(object):
"""docstring for SynthFile.""" """docstring for SynthFile."""
def __init__(self, word, phon, filename, voice, rate, operation): def __init__(self, word, phon, filename, voice, voice_lang, rate, operation):
super(SynthFile, self).__init__() super(SynthFile, self).__init__()
self.word = word self.word = word
self.phoneme = phon self.phoneme = phon
self.filename = filename self.filename = filename
self.voice = voice self.voice = voice
self.voice_lang = voice_lang
self.rate = rate self.rate = rate
self.variant = operation self.variant = operation
@ -54,15 +69,21 @@ class SynthFile(object):
} }
def get_csv(self): def get_csv(self):
return '{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice, cols = [self.word, self.phoneme, self.voice,
self.rate, self.variant, self.voice_lang, self.rate, self.variant,
self.filename) self.filename]
return ','.join([str(c) for c in cols])+'\n'
# return '{},{},{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice,
# self.voice_lang, self.rate, self.variant,
# self.filename)
class SynthVariant(object): class SynthVariant(object):
"""docstring for SynthVariant.""" """docstring for SynthVariant."""
def __init__(self, identifier, rate): def __init__(self, identifier, voice, lang, rate):
super(SynthVariant, self).__init__() super(SynthVariant, self).__init__()
self.synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier) self.synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
self.synth.setVolume_(100) self.synth.setVolume_(100)
@ -75,10 +96,18 @@ class SynthVariant(object):
NSSpeechModePhoneme, NSSpeechInputModeProperty, None) NSSpeechModePhoneme, NSSpeechInputModeProperty, None)
self.identifier = identifier self.identifier = identifier
self.rate = rate self.rate = rate
self.name = identifier.split('.')[-1] self.name = voice
self.lang = lang
self.phoneme_capable = self.is_phoneme_capable()
if self.phoneme_capable:
create_dir(dest_dir + self.name + '/' + str(self.rate))
def __repr__(self): def __repr__(self):
return 'Synthesizer[{} - {}]({})'.format(self.name, self.rate) return 'Synthesizer[{} - {}]'.format(self.name, self.rate)
def is_phoneme_capable(self):
orig_phon = self.synth.phonemesFromText_('water')
return orig_phon != ''
def generate_audio(self, word, variant): def generate_audio(self, word, variant):
orig_phon, phoneme, phon_cmd = self.synth.phonemesFromText_( orig_phon, phoneme, phon_cmd = self.synth.phonemesFromText_(
@ -91,17 +120,17 @@ class SynthVariant(object):
phon_cmd = '[[inpt PHON]] ' + phoneme phon_cmd = '[[inpt PHON]] ' + phoneme
elif variant == 'high': elif variant == 'high':
phoneme = orig_phon phoneme = orig_phon
phon_cmd = word phon_cmd = '[[inpt PHON]] ' + phoneme
# elif variant == 'long': # elif variant == 'long':
# if phon != '': # if phon != '':
# self.phone_synth.startSpeakingString_toURL_(phon,d_url) # self.phone_synth.startSpeakingString_toURL_(phon,d_url)
# else: # else:
# self.synth.startSpeakingString_toURL_(word,d_url) # self.synth.startSpeakingString_toURL_(word,d_url)
fname = dest_filename(word, phoneme, self.name, self.rate) fname = dest_filename(word, self.name, self.rate, variant)
d_path = dest_path(self.name, self.rate, fname) d_path, r_path = dest_path(self.name, self.rate, fname)
# d_url = NSURL.fileURLWithPath_(d_path) # d_url = NSURL.fileURLWithPath_(d_path)
cli_gen_audio(phon_cmd, self.rate, self.name, d_path) cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
return SynthFile(word, phoneme, fname, self.name, self.rate, variant) return SynthFile(word, phoneme, r_path, self.name, self.lang, self.rate, variant)
def synth_generator(): def synth_generator():
@ -109,11 +138,19 @@ def synth_generator():
voice_attrs = [ voice_attrs = [
NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed
] ]
# sk = [k for k in voice_attrs[0].keys() if k not in [
# 'VoiceIndividuallySpokenCharacters', 'VoiceSupportedCharacters']]
# s_attrs = [[v[i] for i in sk] for v in voice_attrs if 'VoiceShowInFullListOnly' in v
# and 'VoiceRelativeDesirability' in v]
us_voices_ids = [ us_voices_ids = [
v['VoiceIdentifier'] for v in voice_attrs (v['VoiceIdentifier'],
v['VoiceName'],
v['VoiceLanguage']) for v in voice_attrs
if v['VoiceLanguage'] == 'en-US' if v['VoiceLanguage'] == 'en-US'
and v['VoiceIdentifier'].split('.')[-1][0].isupper() and v['VoiceGender'] != 'VoiceGenderNeuter'
] ]
# import pdb
# pdb.set_trace()
# us_voices_ids = ['com.apple.speech.synthesis.voice.Fred', # us_voices_ids = ['com.apple.speech.synthesis.voice.Fred',
# 'com.apple.speech.synthesis.voice.Alex', # 'com.apple.speech.synthesis.voice.Alex',
# 'com.apple.speech.synthesis.voice.Victoria'] # 'com.apple.speech.synthesis.voice.Victoria']
@ -121,19 +158,24 @@ def synth_generator():
voice_rates = [150, 180, 210, 250] voice_rates = [150, 180, 210, 250]
voice_synths = [] voice_synths = []
create_dir(dest_dir) create_dir(dest_dir)
for v in us_voices_ids: for (i, v, l) in us_voices_ids:
for r in voice_rates: for r in voice_rates:
create_dir(dest_dir + v + '/' + r) s = SynthVariant(i, v, l, r)
voice_synths.append(SynthVariant(v, r)) if s.phoneme_capable:
print('Adding ', s)
voice_synths.append(s)
else:
print('Discarding phoneme incapable ', s)
def synth_for_words(words): def synth_for_words(words, writer):
all_synths = [] prog_title = "Synthesizing {} words : ".format(len(words))
for w in words: (update, prog) = prog_bar(prog_title)
for w in prog(words):
for s in voice_synths: for s in voice_synths:
for v in ['low', 'medium', 'high']: for v in ['low', 'medium', 'high']:
all_synths.append(s.generate_audio(w, v)) update('"{}" with {} variant ({})'.format(w, s, v))
return all_synths synthed = s.generate_audio(w, v)
writer(synthed)
return synth_for_words return synth_for_words
@ -147,25 +189,50 @@ def write_synths(synth_list, fname, csv=False):
f.close() f.close()
def synth_logger(fname, csv=False):
    """Open *fname* for writing and return a ``(writer, close_file)`` pair.

    Args:
        fname: Path of the output file; it is opened (and truncated) at once.
        csv: When true, ``writer(s)`` appends ``s.get_csv()`` to the file
            immediately. When false, ``writer(s)`` only collects ``s`` and
            ``close_file()`` dumps ``[s.get_json() ...]`` as one JSON list.

    Returns:
        ``(writer, close_file)``. ``writer`` takes one object exposing
        ``get_csv()`` / ``get_json()``; ``close_file`` takes no arguments,
        finalises the output and closes the file. Calling ``close_file``
        more than once is a no-op after the first call.
    """
    f = open(fname, 'w')
    synth_list = []

    def csv_writer(s):
        f.write(s.get_csv())
        # Hand each row to the OS straight away so rows already produced
        # are not stuck in the Python-level buffer if the run dies later.
        f.flush()

    def json_writer(s):
        synth_list.append(s)

    def close_file():
        # Guard against double close: dumping JSON into a closed file raises.
        if f.closed:
            return
        try:
            if not csv:
                json.dump([s.get_json() for s in synth_list], f)
        finally:
            # Release the handle even when serialisation fails.
            f.close()

    if csv:
        return csv_writer, close_file
    return json_writer, close_file
def generate_audio_for_stories(): def generate_audio_for_stories():
stories_data = json.load(open('./inputs/all_stories_hs.json')) # story_file = './inputs/all_stories_hs.json'
word_list = [t[0] for i in stories_data.values() for t in i] story_file = './inputs/all_stories.json'
words_audio_synth = synth_generator() stories_data = json.load(open(story_file))
return words_audio_synth(word_list) # word_list = [t[0] for i in stories_data.values() for t in i]
word_list = [i for g in stories_data.values() for i in g]
(writer, closer) = synth_logger(dest_file, csv=True)
synth_for_words = synth_generator()
try:
synth_for_words(word_list, writer)
except:
import traceback
import sys
traceback.print_exc(file=sys.stdout)
pass
closer()
# synths = synth_generator()([OUTPUT_NAME])
# words_audio_synth = synth_generator() # write_synths(synths, dest_file, True)
# synth = NSSpeechSynthesizer.alloc().init()
# voices_installed = NSSpeechSynthesizer.availableVoices()
# voice_attrs = [NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed]
# us_voices_ids = [v['VoiceIdentifier'] for v in voice_attrs if v['VoiceLanguage'] == 'en-US' and v['VoiceIdentifier'].split('.')[-1][0].isupper()]
# synth.setVoice_(us_voices_ids[2])
# synth.startSpeakingString_('your')
# fname = dest_filename(word,self.name,self.rate,self.operation)
# d_path = dest_path(fname)
# d_url = dest_url(d_path)
synths = synth_generator()([OUTPUT_NAME])
# synths = generate_audio_for_stories()
write_synths(synths, dest_file, True)
# write_synths(synths,'./outputs/synths.json') # write_synths(synths,'./outputs/synths.json')
if __name__ == '__main__':
generate_audio_for_stories()