wip high variant phoneme

discarding phoneme incapable synthesizers
fixed progress
2017-10-26 18:06:14 +05:30 · 2017-10-26 16:51:32 +05:30 · 2017-10-26 16:18:17 +05:30 · 2017-10-26 15:58:25 +05:30 · 2017-10-26 15:27:22 +05:30
3 changed files with 121 additions and 47 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -138,7 +138,7 @@ Temporary Items
 # End of https://www.gitignore.io/api/macos

 outputs/*
-inputs/mnist
+inputs/*
 inputs/audio*
 logs/*
 models/*
--- a/TODO.md
+++ b/TODO.md
@@ -2,3 +2,10 @@
 1. create spectrograms of 150ms windows with 50ms overlap for each word.
 2. train a rnn to output a vector using the spectrograms
 3. train a nn to output True/False based on the acceptability of the rnn output. -> Siamese network(implementation detail)
+4. validate with real-world samples
+
+The same word spoken by multiple people, etc., will have a low distance. Two words that are very different (you can use the similarity measure given in the speech_recognition repo) will have a high distance.
+
+A sample with the wrong pronunciation will have a medium distance from one with the right pronunciation.
+
+I also had good experience getting non-English voices to speak the English words to produce "wrong" pronunciations, so those will be subtly different too.
--- a/tts_samplegen.py
+++ b/tts_samplegen.py
@@ -7,24 +7,38 @@ import random
 import os
 import re
 import subprocess
+import progressbar

-OUTPUT_NAME = 'audio'
+from generate_similar import similar_phonemes
+
+OUTPUT_NAME = 'story_sents'
 dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
 dest_file = './outputs/' + OUTPUT_NAME + '.csv'


+def prog_bar(title):
+    widgets = [title, progressbar.Counter(), 'th entry - ', progressbar.FormatLabel(
+        ''), ' [', progressbar.Bar(), '] - ', progressbar.ETA()]
+    prog = progressbar.ProgressBar(widgets=widgets)
+
+    def update_prog(current):
+        widgets[3] = progressbar.FormatLabel(current)
+        prog.update()
+    return (update_prog, prog)
+
+
 def create_dir(direc):
    if not os.path.exists(direc):
-        os.mkdir(direc)
+        os.makedirs(direc)


-def dest_filename(n, v, r, t):
-    return '{}-{}-{}-{}-'.format(n, v, r,
-                                 t) + str(random.randint(0, 10000)) + '.aiff'
+def dest_filename(w, v, r, t):
+    return '{}-{}-{}-{}-{}.aiff'.format(w, v, r, t, str(random.randint(0, 10000)))


 def dest_path(v, r, n):
-    return dest_dir + v + '/' + r + '/' + n
+    rel = v + '/' + str(r) + '/' + n
+    return (dest_dir + rel), rel


 def cli_gen_audio(speech_cmd, rate, voice, out_path):
@@ -36,12 +50,13 @@ def cli_gen_audio(speech_cmd, rate, voice, out_path):
 class SynthFile(object):
    """docstring for SynthFile."""

-    def __init__(self, word, phon, filename, voice, rate, operation):
+    def __init__(self, word, phon, filename, voice, voice_lang, rate, operation):
        super(SynthFile, self).__init__()
        self.word = word
        self.phoneme = phon
        self.filename = filename
        self.voice = voice
+        self.voice_lang = voice_lang
        self.rate = rate
        self.variant = operation

@@ -54,15 +69,21 @@ class SynthFile(object):
        }

    def get_csv(self):
-        return '{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice,
-                                         self.rate, self.variant,
-                                         self.filename)
+        cols = [self.word, self.phoneme, self.voice,
+                self.voice_lang, self.rate, self.variant,
+                self.filename]
+
+        return ','.join([str(c) for c in cols])+'\n'
+        # return '{},{},{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice,
+#                                        self.voice_lang, self.rate, self.variant,
+#                                        self.filename)
+


 class SynthVariant(object):
    """docstring for SynthVariant."""

-    def __init__(self, identifier, rate):
+    def __init__(self, identifier, voice, lang, rate):
        super(SynthVariant, self).__init__()
        self.synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
        self.synth.setVolume_(100)
@@ -75,10 +96,18 @@ class SynthVariant(object):
            NSSpeechModePhoneme, NSSpeechInputModeProperty, None)
        self.identifier = identifier
        self.rate = rate
-        self.name = identifier.split('.')[-1]
+        self.name = voice
+        self.lang = lang
+        self.phoneme_capable = self.is_phoneme_capable()
+        if self.phoneme_capable:
+            create_dir(dest_dir + self.name + '/' + str(self.rate))

    def __repr__(self):
-        return 'Synthesizer[{} - {}]({})'.format(self.name, self.rate)
+        return 'Synthesizer[{} - {}]'.format(self.name, self.rate)
+
+    def is_phoneme_capable(self):
+        orig_phon = self.synth.phonemesFromText_('water')
+        return orig_phon != ''

    def generate_audio(self, word, variant):
        orig_phon, phoneme, phon_cmd = self.synth.phonemesFromText_(
@@ -91,17 +120,17 @@ class SynthVariant(object):
            phon_cmd = '[[inpt PHON]] ' + phoneme
        elif variant == 'high':
            phoneme = orig_phon
-            phon_cmd = word
+            phon_cmd = '[[inpt PHON]] ' + phoneme
        # elif variant == 'long':
        # if phon != '':
        # self.phone_synth.startSpeakingString_toURL_(phon,d_url)
        # else:
        #     self.synth.startSpeakingString_toURL_(word,d_url)
-        fname = dest_filename(word, phoneme, self.name, self.rate)
-        d_path = dest_path(self.name, self.rate, fname)
+        fname = dest_filename(word, self.name, self.rate, variant)
+        d_path, r_path = dest_path(self.name, self.rate, fname)
        # d_url = NSURL.fileURLWithPath_(d_path)
        cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
-        return SynthFile(word, phoneme, fname, self.name, self.rate, variant)
+        return SynthFile(word, phoneme, r_path, self.name, self.lang, self.rate, variant)


 def synth_generator():
@@ -109,11 +138,19 @@ def synth_generator():
    voice_attrs = [
        NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed
    ]
+    # sk = [k for k in voice_attrs[0].keys() if k not in [
+    #     'VoiceIndividuallySpokenCharacters', 'VoiceSupportedCharacters']]
+    # s_attrs = [[v[i] for i in sk] for v in voice_attrs if 'VoiceShowInFullListOnly' in v
+    #            and 'VoiceRelativeDesirability' in v]
    us_voices_ids = [
-        v['VoiceIdentifier'] for v in voice_attrs
+        (v['VoiceIdentifier'],
+         v['VoiceName'],
+         v['VoiceLanguage']) for v in voice_attrs
        if v['VoiceLanguage'] == 'en-US'
-        and v['VoiceIdentifier'].split('.')[-1][0].isupper()
+        and v['VoiceGender'] != 'VoiceGenderNeuter'
    ]
+    # import pdb
+    # pdb.set_trace()
    # us_voices_ids = ['com.apple.speech.synthesis.voice.Fred',
    #                  'com.apple.speech.synthesis.voice.Alex',
    #                  'com.apple.speech.synthesis.voice.Victoria']
@@ -121,19 +158,24 @@ def synth_generator():
    voice_rates = [150, 180, 210, 250]
    voice_synths = []
    create_dir(dest_dir)
-    for v in us_voices_ids:
+    for (i, v, l) in us_voices_ids:
        for r in voice_rates:
-            create_dir(dest_dir + v + '/' + r)
-            voice_synths.append(SynthVariant(v, r))
+            s = SynthVariant(i, v, l, r)
+            if s.phoneme_capable:
+                print('Adding ', s)
+                voice_synths.append(s)
+            else:
+                print('Discarding phoneme incapable ', s)

-    def synth_for_words(words):
-        all_synths = []
-        for w in words:
+    def synth_for_words(words, writer):
+        prog_title = "Synthesizing {} words : ".format(len(words))
+        (update, prog) = prog_bar(prog_title)
+        for w in prog(words):
            for s in voice_synths:
                for v in ['low', 'medium', 'high']:
-                    all_synths.append(s.generate_audio(w, v))
-        return all_synths
-
+                    update('"{}" with {} variant ({})'.format(w, s, v))
+                    synthed = s.generate_audio(w, v)
+                    writer(synthed)
    return synth_for_words


@@ -147,25 +189,50 @@ def write_synths(synth_list, fname, csv=False):
    f.close()


+def synth_logger(fname, csv=False):
+    f = open(fname, 'w')
+
+    def csv_writer(s):
+        f.write(s.get_csv())
+    synth_list = []
+
+    def json_writer(s):
+        synth_list.append(s)
+
+    def close_file():
+        if csv:
+            f.close()
+        else:
+            json.dump([s.get_json() for s in synth_list], f)
+            f.close()
+    if csv:
+        return csv_writer, close_file
+    else:
+        return json_writer, close_file
+
+
 def generate_audio_for_stories():
-    stories_data = json.load(open('./inputs/all_stories_hs.json'))
-    word_list = [t[0] for i in stories_data.values() for t in i]
-    words_audio_synth = synth_generator()
-    return words_audio_synth(word_list)
+    # story_file = './inputs/all_stories_hs.json'
+    story_file = './inputs/all_stories.json'
+    stories_data = json.load(open(story_file))
+    # word_list = [t[0] for i in stories_data.values() for t in i]
+    word_list = [i for g in stories_data.values() for i in g]
+    (writer, closer) = synth_logger(dest_file, csv=True)
+    synth_for_words = synth_generator()
+    try:
+        synth_for_words(word_list, writer)
+    except:
+        import traceback
+        import sys
+        traceback.print_exc(file=sys.stdout)
+        pass
+    closer()

+# synths = synth_generator()([OUTPUT_NAME])

-# words_audio_synth = synth_generator()
-# synth = NSSpeechSynthesizer.alloc().init()
-# voices_installed = NSSpeechSynthesizer.availableVoices()
-# voice_attrs = [NSSpeechSynthesizer.attributesForVoice_(v) for v in  voices_installed]
-# us_voices_ids = [v['VoiceIdentifier'] for v in voice_attrs if v['VoiceLanguage'] == 'en-US' and v['VoiceIdentifier'].split('.')[-1][0].isupper()]
-# synth.setVoice_(us_voices_ids[2])
-# synth.startSpeakingString_('your')
-# fname = dest_filename(word,self.name,self.rate,self.operation)
-# d_path = dest_path(fname)
-# d_url = dest_url(d_path)
-
-synths = synth_generator()([OUTPUT_NAME])
-# synths = generate_audio_for_stories()
-write_synths(synths, dest_file, True)
+# write_synths(synths, dest_file, True)
 # write_synths(synths,'./outputs/synths.json')
+
+
+if __name__ == '__main__':
+    generate_audio_for_stories()
Author	SHA1	Message	Date
Malar Kannan	eb3ce8b7e5	wip high variant phoneme	2017-10-26 18:06:14 +05:30
Malar Kannan	e57576d6fa	discarding phoneme incapable synthesizers	2017-10-26 16:51:32 +05:30
Malar Kannan	a953fa3355	fixed progress	2017-10-26 16:18:17 +05:30
Malar Kannan	7a520b79f4	writing to csv proactively	2017-10-26 15:58:25 +05:30
Malar Kannan	05f36daf7e	refactored sample generation code	2017-10-26 15:27:22 +05:30