wip high variant phoneme

master
Malar Kannan 2017-10-26 18:06:14 +05:30
parent e57576d6fa
commit eb3ce8b7e5
2 changed files with 24 additions and 29 deletions

View File

@ -3,3 +3,9 @@
2. train a rnn to output a vector using the spectrograms 2. train a rnn to output a vector using the spectrograms
3. train a nn to output True/False based on the acceptability of the rnn output. -> Siamese network(implementation detail) 3. train a nn to output True/False based on the acceptability of the rnn output. -> Siamese network(implementation detail)
4. validate with real world samples 4. validate with real world samples
The same word spoken by multiple people, etc., will have a low distance. Two words which are very different (you can use the similarity measure given in the speech_recognition repo) will have a high distance.
A sample with the wrong pronunciation will have a medium distance from one with the right pronunciation.
I also had good experience with getting non-English voices to speak the English words to obtain "wrong" pronunciations - those will be subtly different too.

View File

@ -9,13 +9,15 @@ import re
import subprocess import subprocess
import progressbar import progressbar
from generate_similar import similar_phonemes
OUTPUT_NAME = 'story_sents' OUTPUT_NAME = 'story_sents'
dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/' dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
dest_file = './outputs/' + OUTPUT_NAME + '.csv' dest_file = './outputs/' + OUTPUT_NAME + '.csv'
def prog_bar(title): def prog_bar(title):
widgets = [title,progressbar.Counter(),'th entry - ', progressbar.FormatLabel( widgets = [title, progressbar.Counter(), 'th entry - ', progressbar.FormatLabel(
''), ' [', progressbar.Bar(), '] - ', progressbar.ETA()] ''), ' [', progressbar.Bar(), '] - ', progressbar.ETA()]
prog = progressbar.ProgressBar(widgets=widgets) prog = progressbar.ProgressBar(widgets=widgets)
@ -48,12 +50,13 @@ def cli_gen_audio(speech_cmd, rate, voice, out_path):
class SynthFile(object): class SynthFile(object):
"""docstring for SynthFile.""" """docstring for SynthFile."""
def __init__(self, word, phon, filename, voice, rate, operation): def __init__(self, word, phon, filename, voice, voice_lang, rate, operation):
super(SynthFile, self).__init__() super(SynthFile, self).__init__()
self.word = word self.word = word
self.phoneme = phon self.phoneme = phon
self.filename = filename self.filename = filename
self.voice = voice self.voice = voice
self.voice_lang = voice_lang
self.rate = rate self.rate = rate
self.variant = operation self.variant = operation
@ -66,9 +69,15 @@ class SynthFile(object):
} }
def get_csv(self): def get_csv(self):
return '{},{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice, cols = [self.word, self.phoneme, self.voice,
self.rate, self.variant, self.voice_lang, self.rate, self.variant,
self.filename) self.filename]
return ','.join([str(c) for c in cols])+'\n'
# return '{},{},{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice,
# self.voice_lang, self.rate, self.variant,
# self.filename)
class SynthVariant(object): class SynthVariant(object):
@ -90,7 +99,8 @@ class SynthVariant(object):
self.name = voice self.name = voice
self.lang = lang self.lang = lang
self.phoneme_capable = self.is_phoneme_capable() self.phoneme_capable = self.is_phoneme_capable()
create_dir(dest_dir + self.name + '/' + str(self.rate)) if self.phoneme_capable:
create_dir(dest_dir + self.name + '/' + str(self.rate))
def __repr__(self): def __repr__(self):
return 'Synthesizer[{} - {}]'.format(self.name, self.rate) return 'Synthesizer[{} - {}]'.format(self.name, self.rate)
@ -120,7 +130,7 @@ class SynthVariant(object):
d_path, r_path = dest_path(self.name, self.rate, fname) d_path, r_path = dest_path(self.name, self.rate, fname)
# d_url = NSURL.fileURLWithPath_(d_path) # d_url = NSURL.fileURLWithPath_(d_path)
cli_gen_audio(phon_cmd, self.rate, self.name, d_path) cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
return SynthFile(word, phoneme, r_path, self.name, self.rate, variant) return SynthFile(word, phoneme, r_path, self.name, self.lang, self.rate, variant)
def synth_generator(): def synth_generator():
@ -136,14 +146,8 @@ def synth_generator():
(v['VoiceIdentifier'], (v['VoiceIdentifier'],
v['VoiceName'], v['VoiceName'],
v['VoiceLanguage']) for v in voice_attrs v['VoiceLanguage']) for v in voice_attrs
# v['VoiceDemoText'],
# v['VoiceShowInFullListOnly'],
# v['VoiceRelativeDesirability'])
if v['VoiceLanguage'] == 'en-US' if v['VoiceLanguage'] == 'en-US'
and v['VoiceGender'] != 'VoiceGenderNeuter' and v['VoiceGender'] != 'VoiceGenderNeuter'
# and v['VoiceIdentifier'].split('.')[-1][0].isupper()
# and 'VoiceShowInFullListOnly' in v
# and 'VoiceRelativeDesirability' in v
] ]
# import pdb # import pdb
# pdb.set_trace() # pdb.set_trace()
@ -164,7 +168,6 @@ def synth_generator():
print('Discarding phoneme incapable ', s) print('Discarding phoneme incapable ', s)
def synth_for_words(words, writer): def synth_for_words(words, writer):
# all_synths = []
prog_title = "Synthesizing {} words : ".format(len(words)) prog_title = "Synthesizing {} words : ".format(len(words))
(update, prog) = prog_bar(prog_title) (update, prog) = prog_bar(prog_title)
for w in prog(words): for w in prog(words):
@ -173,9 +176,6 @@ def synth_generator():
update('"{}" with {} variant ({})'.format(w, s, v)) update('"{}" with {} variant ({})'.format(w, s, v))
synthed = s.generate_audio(w, v) synthed = s.generate_audio(w, v)
writer(synthed) writer(synthed)
# all_synths.append(synthed)
# return all_synths
return synth_for_words return synth_for_words
@ -228,22 +228,11 @@ def generate_audio_for_stories():
pass pass
closer() closer()
# words_audio_synth = synth_generator()
# synth = NSSpeechSynthesizer.alloc().init()
# voices_installed = NSSpeechSynthesizer.availableVoices()
# voice_attrs = [NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed]
# us_voices_ids = [v['VoiceIdentifier'] for v in voice_attrs if v['VoiceLanguage'] == 'en-US' and v['VoiceIdentifier'].split('.')[-1][0].isupper()]
# synth.setVoice_(us_voices_ids[2])
# synth.startSpeakingString_('your')
# fname = dest_filename(word,self.name,self.rate,self.operation)
# d_path = dest_path(fname)
# d_url = dest_url(d_path)
# synths = synth_generator()([OUTPUT_NAME]) # synths = synth_generator()([OUTPUT_NAME])
# write_synths(synths, dest_file, True) # write_synths(synths, dest_file, True)
# write_synths(synths,'./outputs/synths.json') # write_synths(synths,'./outputs/synths.json')
if __name__ == '__main__': if __name__ == '__main__':
generate_audio_for_stories() generate_audio_for_stories()