wip high variant phoneme

2017-10-26 18:06:14 +05:30
parent e57576d6fa
commit eb3ce8b7e5
2 changed files with 24 additions and 29 deletions
--- a/TODO.md
+++ b/TODO.md
@@ -3,3 +3,9 @@
 2. train a rnn to output a vector using the spectrograms
 3. train a nn to output True/False based on the acceptability of the rnn output. -> Siamese network(implementation detail)
 4. validate with real world samples
+
+The same word spoken by multiple people, etc., will have a low distance. Two words that are very different (you can use the similarity measure given in the speech_recognition repo) will have a high distance.
+
+A sample with the wrong pronunciation will have a medium distance from one with the right pronunciation.
+
+I also had good experience getting non-English voices to speak the English words to produce "wrong" pronunciations, so those will be subtly different too.
--- a/tts_samplegen.py
+++ b/tts_samplegen.py
@@ -9,13 +9,15 @@ import re
 import subprocess
 import progressbar

+from generate_similar import similar_phonemes
+
 OUTPUT_NAME = 'story_sents'
 dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
 dest_file = './outputs/' + OUTPUT_NAME + '.csv'


 def prog_bar(title):
-    widgets = [title,progressbar.Counter(),'th entry - ', progressbar.FormatLabel(
+    widgets = [title, progressbar.Counter(), 'th entry - ', progressbar.FormatLabel(
        ''), ' [', progressbar.Bar(), '] - ', progressbar.ETA()]
    prog = progressbar.ProgressBar(widgets=widgets)

@@ -48,12 +50,13 @@ def cli_gen_audio(speech_cmd, rate, voice, out_path):
 class SynthFile(object):
    """docstring for SynthFile."""

-    def __init__(self, word, phon, filename, voice, rate, operation):
+    def __init__(self, word, phon, filename, voice, voice_lang, rate, operation):
        super(SynthFile, self).__init__()
        self.word = word
        self.phoneme = phon
        self.filename = filename
        self.voice = voice
+        self.voice_lang = voice_lang
        self.rate = rate
        self.variant = operation

@@ -66,9 +69,15 @@ class SynthFile(object):
        }

    def get_csv(self):
-        return '{},{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice,
-                                            self.rate, self.variant,
-                                            self.filename)
+        cols = [self.word, self.phoneme, self.voice,
+                self.voice_lang, self.rate, self.variant,
+                self.filename]
+
+        return ','.join([str(c) for c in cols])+'\n'
+        # return '{},{},{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice,
+#                                        self.voice_lang, self.rate, self.variant,
+#                                        self.filename)
+


 class SynthVariant(object):
@@ -90,7 +99,8 @@ class SynthVariant(object):
        self.name = voice
        self.lang = lang
        self.phoneme_capable = self.is_phoneme_capable()
-        create_dir(dest_dir + self.name + '/' + str(self.rate))
+        if self.phoneme_capable:
+            create_dir(dest_dir + self.name + '/' + str(self.rate))

    def __repr__(self):
        return 'Synthesizer[{} - {}]'.format(self.name, self.rate)
@@ -120,7 +130,7 @@ class SynthVariant(object):
        d_path, r_path = dest_path(self.name, self.rate, fname)
        # d_url = NSURL.fileURLWithPath_(d_path)
        cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
-        return SynthFile(word, phoneme, r_path, self.name, self.rate, variant)
+        return SynthFile(word, phoneme, r_path, self.name, self.lang, self.rate, variant)


 def synth_generator():
@@ -136,14 +146,8 @@ def synth_generator():
        (v['VoiceIdentifier'],
         v['VoiceName'],
         v['VoiceLanguage']) for v in voice_attrs
-        #  v['VoiceDemoText'],
-        #  v['VoiceShowInFullListOnly'],
-        #  v['VoiceRelativeDesirability'])
        if v['VoiceLanguage'] == 'en-US'
        and v['VoiceGender'] != 'VoiceGenderNeuter'
-        # and v['VoiceIdentifier'].split('.')[-1][0].isupper()
-        # and 'VoiceShowInFullListOnly' in v
-        # and 'VoiceRelativeDesirability' in v
    ]
    # import pdb
    # pdb.set_trace()
@@ -164,7 +168,6 @@ def synth_generator():
                print('Discarding phoneme incapable ', s)

    def synth_for_words(words, writer):
-        # all_synths = []
        prog_title = "Synthesizing {} words : ".format(len(words))
        (update, prog) = prog_bar(prog_title)
        for w in prog(words):
@@ -173,9 +176,6 @@ def synth_generator():
                    update('"{}" with {} variant ({})'.format(w, s, v))
                    synthed = s.generate_audio(w, v)
                    writer(synthed)
-                    # all_synths.append(synthed)
-        # return all_synths
-
    return synth_for_words


@@ -228,22 +228,11 @@ def generate_audio_for_stories():
        pass
    closer()

-# words_audio_synth = synth_generator()
-# synth = NSSpeechSynthesizer.alloc().init()
-# voices_installed = NSSpeechSynthesizer.availableVoices()
-# voice_attrs = [NSSpeechSynthesizer.attributesForVoice_(v) for v in  voices_installed]
-# us_voices_ids = [v['VoiceIdentifier'] for v in voice_attrs if v['VoiceLanguage'] == 'en-US' and v['VoiceIdentifier'].split('.')[-1][0].isupper()]
-# synth.setVoice_(us_voices_ids[2])
-# synth.startSpeakingString_('your')
-# fname = dest_filename(word,self.name,self.rate,self.operation)
-# d_path = dest_path(fname)
-# d_url = dest_url(d_path)
-
-
 # synths = synth_generator()([OUTPUT_NAME])

 # write_synths(synths, dest_file, True)
 # write_synths(synths,'./outputs/synths.json')

+
 if __name__ == '__main__':
    generate_audio_for_stories()