diff --git a/TODO.md b/TODO.md index 9bc98eb..3cbbec4 100644 --- a/TODO.md +++ b/TODO.md @@ -3,3 +3,9 @@ 2. train a rnn to output a vector using the spectrograms 3. train a nn to output True/False based on the acceptability of the rnn output. -> Siamese network(implementation detail) 4. validate with real world samples + +The same word spoken by multiple people, etc., will have a low distance. Two words which are very different (you can use the similarity measure given in the speech_recognition repo) will have a high distance. + +The one with the wrong pronunciation will have a medium distance from the one with the right pronunciation. + +I also had a good experience getting non-English voices to speak the English words to get a "wrong" pronunciation, so that will be subtly different too. diff --git a/tts_samplegen.py b/tts_samplegen.py index cb27f4e..d83c86e 100644 --- a/tts_samplegen.py +++ b/tts_samplegen.py @@ -9,13 +9,15 @@ import re import subprocess import progressbar +from generate_similar import similar_phonemes + OUTPUT_NAME = 'story_sents' dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/' dest_file = './outputs/' + OUTPUT_NAME + '.csv' def prog_bar(title): - widgets = [title,progressbar.Counter(),'th entry - ', progressbar.FormatLabel( + widgets = [title, progressbar.Counter(), 'th entry - ', progressbar.FormatLabel( ''), ' [', progressbar.Bar(), '] - ', progressbar.ETA()] prog = progressbar.ProgressBar(widgets=widgets) @@ -48,12 +50,13 @@ def cli_gen_audio(speech_cmd, rate, voice, out_path): class SynthFile(object): """docstring for SynthFile.""" - def __init__(self, word, phon, filename, voice, rate, operation): + def __init__(self, word, phon, filename, voice, voice_lang, rate, operation): super(SynthFile, self).__init__() self.word = word self.phoneme = phon self.filename = filename self.voice = voice + self.voice_lang = voice_lang self.rate = rate self.variant = operation @@ -66,9 +69,15 @@ class SynthFile(object): } def get_csv(self): - return 
'{},{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice, - self.rate, self.variant, - self.filename) + cols = [self.word, self.phoneme, self.voice, + self.voice_lang, self.rate, self.variant, + self.filename] + + return ','.join([str(c) for c in cols])+'\n' + # return '{},{},{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice, +# self.voice_lang, self.rate, self.variant, +# self.filename) + class SynthVariant(object): @@ -90,7 +99,8 @@ class SynthVariant(object): self.name = voice self.lang = lang self.phoneme_capable = self.is_phoneme_capable() - create_dir(dest_dir + self.name + '/' + str(self.rate)) + if self.phoneme_capable: + create_dir(dest_dir + self.name + '/' + str(self.rate)) def __repr__(self): return 'Synthesizer[{} - {}]'.format(self.name, self.rate) @@ -120,7 +130,7 @@ class SynthVariant(object): d_path, r_path = dest_path(self.name, self.rate, fname) # d_url = NSURL.fileURLWithPath_(d_path) cli_gen_audio(phon_cmd, self.rate, self.name, d_path) - return SynthFile(word, phoneme, r_path, self.name, self.rate, variant) + return SynthFile(word, phoneme, r_path, self.name, self.lang, self.rate, variant) def synth_generator(): @@ -136,14 +146,8 @@ def synth_generator(): (v['VoiceIdentifier'], v['VoiceName'], v['VoiceLanguage']) for v in voice_attrs - # v['VoiceDemoText'], - # v['VoiceShowInFullListOnly'], - # v['VoiceRelativeDesirability']) if v['VoiceLanguage'] == 'en-US' and v['VoiceGender'] != 'VoiceGenderNeuter' - # and v['VoiceIdentifier'].split('.')[-1][0].isupper() - # and 'VoiceShowInFullListOnly' in v - # and 'VoiceRelativeDesirability' in v ] # import pdb # pdb.set_trace() @@ -164,7 +168,6 @@ def synth_generator(): print('Discarding phoneme incapable ', s) def synth_for_words(words, writer): - # all_synths = [] prog_title = "Synthesizing {} words : ".format(len(words)) (update, prog) = prog_bar(prog_title) for w in prog(words): @@ -173,9 +176,6 @@ def synth_generator(): update('"{}" with {} variant ({})'.format(w, s, v)) synthed 
= s.generate_audio(w, v) writer(synthed) - # all_synths.append(synthed) - # return all_synths - return synth_for_words @@ -228,22 +228,11 @@ def generate_audio_for_stories(): pass closer() -# words_audio_synth = synth_generator() -# synth = NSSpeechSynthesizer.alloc().init() -# voices_installed = NSSpeechSynthesizer.availableVoices() -# voice_attrs = [NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed] -# us_voices_ids = [v['VoiceIdentifier'] for v in voice_attrs if v['VoiceLanguage'] == 'en-US' and v['VoiceIdentifier'].split('.')[-1][0].isupper()] -# synth.setVoice_(us_voices_ids[2]) -# synth.startSpeakingString_('your') -# fname = dest_filename(word,self.name,self.rate,self.operation) -# d_path = dest_path(fname) -# d_url = dest_url(d_path) - - # synths = synth_generator()([OUTPUT_NAME]) # write_synths(synths, dest_file, True) # write_synths(synths,'./outputs/synths.json') + if __name__ == '__main__': generate_audio_for_stories()