wip high variant phoneme

master
Malar Kannan 2017-10-26 18:06:14 +05:30
parent e57576d6fa
commit eb3ce8b7e5
2 changed files with 24 additions and 29 deletions

View File

@ -3,3 +3,9 @@
2. train an RNN to output a vector from the spectrograms
3. train a NN to output True/False based on the acceptability of the RNN output -> Siamese network (implementation detail)
4. validate with real world samples
The same word spoken by multiple people, etc., will have a low distance. Two words that are very different (you can use the similarity measure given in the speech_recognition repo) will have a high distance.
The one with the wrong pronunciation will have a medium distance from the one with the right pronunciation.
I also had good experience getting non-English voices to speak the English words to get a "wrong" pronunciation, so that will be subtly different too.

View File

@ -9,13 +9,15 @@ import re
import subprocess
import progressbar
from generate_similar import similar_phonemes
OUTPUT_NAME = 'story_sents'
dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
dest_file = './outputs/' + OUTPUT_NAME + '.csv'
def prog_bar(title):
widgets = [title,progressbar.Counter(),'th entry - ', progressbar.FormatLabel(
widgets = [title, progressbar.Counter(), 'th entry - ', progressbar.FormatLabel(
''), ' [', progressbar.Bar(), '] - ', progressbar.ETA()]
prog = progressbar.ProgressBar(widgets=widgets)
@ -48,12 +50,13 @@ def cli_gen_audio(speech_cmd, rate, voice, out_path):
class SynthFile(object):
"""docstring for SynthFile."""
def __init__(self, word, phon, filename, voice, rate, operation):
def __init__(self, word, phon, filename, voice, voice_lang, rate, operation):
super(SynthFile, self).__init__()
self.word = word
self.phoneme = phon
self.filename = filename
self.voice = voice
self.voice_lang = voice_lang
self.rate = rate
self.variant = operation
@ -66,9 +69,15 @@ class SynthFile(object):
}
def get_csv(self):
return '{},{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice,
self.rate, self.variant,
self.filename)
cols = [self.word, self.phoneme, self.voice,
self.voice_lang, self.rate, self.variant,
self.filename]
return ','.join([str(c) for c in cols])+'\n'
# return '{},{},{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice,
# self.voice_lang, self.rate, self.variant,
# self.filename)
class SynthVariant(object):
@ -90,6 +99,7 @@ class SynthVariant(object):
self.name = voice
self.lang = lang
self.phoneme_capable = self.is_phoneme_capable()
if self.phoneme_capable:
create_dir(dest_dir + self.name + '/' + str(self.rate))
def __repr__(self):
@ -120,7 +130,7 @@ class SynthVariant(object):
d_path, r_path = dest_path(self.name, self.rate, fname)
# d_url = NSURL.fileURLWithPath_(d_path)
cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
return SynthFile(word, phoneme, r_path, self.name, self.rate, variant)
return SynthFile(word, phoneme, r_path, self.name, self.lang, self.rate, variant)
def synth_generator():
@ -136,14 +146,8 @@ def synth_generator():
(v['VoiceIdentifier'],
v['VoiceName'],
v['VoiceLanguage']) for v in voice_attrs
# v['VoiceDemoText'],
# v['VoiceShowInFullListOnly'],
# v['VoiceRelativeDesirability'])
if v['VoiceLanguage'] == 'en-US'
and v['VoiceGender'] != 'VoiceGenderNeuter'
# and v['VoiceIdentifier'].split('.')[-1][0].isupper()
# and 'VoiceShowInFullListOnly' in v
# and 'VoiceRelativeDesirability' in v
]
# import pdb
# pdb.set_trace()
@ -164,7 +168,6 @@ def synth_generator():
print('Discarding phoneme incapable ', s)
def synth_for_words(words, writer):
# all_synths = []
prog_title = "Synthesizing {} words : ".format(len(words))
(update, prog) = prog_bar(prog_title)
for w in prog(words):
@ -173,9 +176,6 @@ def synth_generator():
update('"{}" with {} variant ({})'.format(w, s, v))
synthed = s.generate_audio(w, v)
writer(synthed)
# all_synths.append(synthed)
# return all_synths
return synth_for_words
@ -228,22 +228,11 @@ def generate_audio_for_stories():
pass
closer()
# words_audio_synth = synth_generator()
# synth = NSSpeechSynthesizer.alloc().init()
# voices_installed = NSSpeechSynthesizer.availableVoices()
# voice_attrs = [NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed]
# us_voices_ids = [v['VoiceIdentifier'] for v in voice_attrs if v['VoiceLanguage'] == 'en-US' and v['VoiceIdentifier'].split('.')[-1][0].isupper()]
# synth.setVoice_(us_voices_ids[2])
# synth.startSpeakingString_('your')
# fname = dest_filename(word,self.name,self.rate,self.operation)
# d_path = dest_path(fname)
# d_url = dest_url(d_path)
# synths = synth_generator()([OUTPUT_NAME])
# write_synths(synths, dest_file, True)
# write_synths(synths,'./outputs/synths.json')
if __name__ == '__main__':
generate_audio_for_stories()