wip high variant phoneme

2017-10-26 18:06:14 +05:30
parent e57576d6fa
commit eb3ce8b7e5
2 changed files with 24 additions and 29 deletions
--- a/TODO.md
+++ b/TODO.md
@@ -3,3 +3,9 @@
 2. train a rnn to output a vector using the spectrograms
 3. train a nn to output True/False based on the acceptability of the rnn output. -> Siamese network(implementation detail)
 4. validate with real world samples
 The same word spoken by multiple people, etc. will have a low distance. Two words that are very different (you can use the similarity measure given in the speech_recognition repo) will have a high distance.
 The one with the wrong pronunciation will have a medium distance from the one with the right pronunciation.
 I also had good experience with getting non-English voices to speak the English words to get "wrong" pronunciations - so those will be subtly different too.
--- a/tts_samplegen.py
+++ b/tts_samplegen.py
@@ -9,13 +9,15 @@ import re
 import subprocess
 import progressbar
 from generate_similar import similar_phonemes
 OUTPUT_NAME = 'story_sents'
 dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
 dest_file = './outputs/' + OUTPUT_NAME + '.csv'
 def prog_bar(title):
-    widgets = [title,progressbar.Counter(),'th entry - ', progressbar.FormatLabel(
+    widgets = [title, progressbar.Counter(), 'th entry - ', progressbar.FormatLabel(
        ''), ' [', progressbar.Bar(), '] - ', progressbar.ETA()]
    prog = progressbar.ProgressBar(widgets=widgets)
@@ -48,12 +50,13 @@ def cli_gen_audio(speech_cmd, rate, voice, out_path):
 class SynthFile(object):
    """docstring for SynthFile."""
-    def __init__(self, word, phon, filename, voice, rate, operation):
+    def __init__(self, word, phon, filename, voice, voice_lang, rate, operation):
        super(SynthFile, self).__init__()
        self.word = word
        self.phoneme = phon
        self.filename = filename
        self.voice = voice
        self.voice_lang = voice_lang
        self.rate = rate
        self.variant = operation
@@ -66,9 +69,15 @@ class SynthFile(object):
        }
    def get_csv(self):
-        return '{},{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice,
+        cols = [self.word, self.phoneme, self.voice,
-                                            self.rate, self.variant,
+                self.voice_lang, self.rate, self.variant,
-                                            self.filename)
+                self.filename]
        return ','.join([str(c) for c in cols])+'\n'
        # return '{},{},{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice,
 #                                        self.voice_lang, self.rate, self.variant,
 #                                        self.filename)
 class SynthVariant(object):
@@ -90,7 +99,8 @@ class SynthVariant(object):
        self.name = voice
        self.lang = lang
        self.phoneme_capable = self.is_phoneme_capable()
-        create_dir(dest_dir + self.name + '/' + str(self.rate))
+        if self.phoneme_capable:
            create_dir(dest_dir + self.name + '/' + str(self.rate))
    def __repr__(self):
        return 'Synthesizer[{} - {}]'.format(self.name, self.rate)
@@ -120,7 +130,7 @@ class SynthVariant(object):
        d_path, r_path = dest_path(self.name, self.rate, fname)
        # d_url = NSURL.fileURLWithPath_(d_path)
        cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
-        return SynthFile(word, phoneme, r_path, self.name, self.rate, variant)
+        return SynthFile(word, phoneme, r_path, self.name, self.lang, self.rate, variant)
 def synth_generator():
@@ -136,14 +146,8 @@ def synth_generator():
        (v['VoiceIdentifier'],
         v['VoiceName'],
         v['VoiceLanguage']) for v in voice_attrs
        #  v['VoiceDemoText'],
        #  v['VoiceShowInFullListOnly'],
        #  v['VoiceRelativeDesirability'])
        if v['VoiceLanguage'] == 'en-US'
        and v['VoiceGender'] != 'VoiceGenderNeuter'
        # and v['VoiceIdentifier'].split('.')[-1][0].isupper()
        # and 'VoiceShowInFullListOnly' in v
        # and 'VoiceRelativeDesirability' in v
    ]
    # import pdb
    # pdb.set_trace()
@@ -164,7 +168,6 @@ def synth_generator():
                print('Discarding phoneme incapable ', s)
    def synth_for_words(words, writer):
        # all_synths = []
        prog_title = "Synthesizing {} words : ".format(len(words))
        (update, prog) = prog_bar(prog_title)
        for w in prog(words):
@@ -173,9 +176,6 @@ def synth_generator():
                    update('"{}" with {} variant ({})'.format(w, s, v))
                    synthed = s.generate_audio(w, v)
                    writer(synthed)
                    # all_synths.append(synthed)
        # return all_synths
    return synth_for_words
@@ -228,22 +228,11 @@ def generate_audio_for_stories():
        pass
    closer()
 # words_audio_synth = synth_generator()
 # synth = NSSpeechSynthesizer.alloc().init()
 # voices_installed = NSSpeechSynthesizer.availableVoices()
 # voice_attrs = [NSSpeechSynthesizer.attributesForVoice_(v) for v in  voices_installed]
 # us_voices_ids = [v['VoiceIdentifier'] for v in voice_attrs if v['VoiceLanguage'] == 'en-US' and v['VoiceIdentifier'].split('.')[-1][0].isupper()]
 # synth.setVoice_(us_voices_ids[2])
 # synth.startSpeakingString_('your')
 # fname = dest_filename(word,self.name,self.rate,self.operation)
 # d_path = dest_path(fname)
 # d_url = dest_url(d_path)
 # synths = synth_generator()([OUTPUT_NAME])
 # write_synths(synths, dest_file, True)
 # write_synths(synths,'./outputs/synths.json')
 if __name__ == '__main__':
    generate_audio_for_stories()