wip high variant phoneme

discarding phoneme incapable synthesizers
fixed progress
2017-10-26 18:06:14 +05:30 · 2017-10-26 16:51:32 +05:30 · 2017-10-26 16:18:17 +05:30 · 2017-10-26 15:58:25 +05:30 · 2017-10-26 15:27:22 +05:30
3 changed files with 121 additions and 47 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -138,7 +138,7 @@ Temporary Items
 # End of https://www.gitignore.io/api/macos
 outputs/*
-inputs/mnist
+inputs/*
 inputs/audio*
 logs/*
 models/*
--- a/TODO.md
+++ b/TODO.md
@@ -2,3 +2,10 @@
 1. create spectrograms of 150ms windows with 50ms overlap for each word.
 2. train an RNN to output a vector using the spectrograms
 3. train an NN to output True/False based on the acceptability of the RNN output. -> Siamese network (implementation detail)
 4. validate with real world samples
 The same word spoken by multiple people, etc., will have a low distance. Two words which are very different (you can use the similarity measure given in the speech_recognition repo) will have a high distance.
 The one with the wrong pronunciation will have a medium distance from the one with the right pronunciation.
 I also had a good experience getting non-English voices to speak the English words to get a "wrong" pronunciation, so that will be subtly different too.
--- a/tts_samplegen.py
+++ b/tts_samplegen.py
@@ -7,24 +7,38 @@ import random
 import os
 import re
 import subprocess
 import progressbar
-OUTPUT_NAME = 'audio'
+from generate_similar import similar_phonemes
 OUTPUT_NAME = 'story_sents'
 dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
 dest_file = './outputs/' + OUTPUT_NAME + '.csv'
 def prog_bar(title):
    widgets = [title, progressbar.Counter(), 'th entry - ', progressbar.FormatLabel(
        ''), ' [', progressbar.Bar(), '] - ', progressbar.ETA()]
    prog = progressbar.ProgressBar(widgets=widgets)
    def update_prog(current):
        widgets[3] = progressbar.FormatLabel(current)
        prog.update()
    return (update_prog, prog)
 def create_dir(direc):
    if not os.path.exists(direc):
-        os.mkdir(direc)
+        os.makedirs(direc)
-def dest_filename(n, v, r, t):
+def dest_filename(w, v, r, t):
-    return '{}-{}-{}-{}-'.format(n, v, r,
+    return '{}-{}-{}-{}-{}.aiff'.format(w, v, r, t, str(random.randint(0, 10000)))
                                 t) + str(random.randint(0, 10000)) + '.aiff'
 def dest_path(v, r, n):
-    return dest_dir + v + '/' + r + '/' + n
+    rel = v + '/' + str(r) + '/' + n
    return (dest_dir + rel), rel
 def cli_gen_audio(speech_cmd, rate, voice, out_path):
@@ -36,12 +50,13 @@ def cli_gen_audio(speech_cmd, rate, voice, out_path):
 class SynthFile(object):
    """docstring for SynthFile."""
-    def __init__(self, word, phon, filename, voice, rate, operation):
+    def __init__(self, word, phon, filename, voice, voice_lang, rate, operation):
        super(SynthFile, self).__init__()
        self.word = word
        self.phoneme = phon
        self.filename = filename
        self.voice = voice
        self.voice_lang = voice_lang
        self.rate = rate
        self.variant = operation
@@ -54,15 +69,21 @@ class SynthFile(object):
        }
    def get_csv(self):
-        return '{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice,
+        cols = [self.word, self.phoneme, self.voice,
-                                         self.rate, self.variant,
+                self.voice_lang, self.rate, self.variant,
-                                         self.filename)
+                self.filename]
        return ','.join([str(c) for c in cols])+'\n'
        # return '{},{},{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice,
 #                                        self.voice_lang, self.rate, self.variant,
 #                                        self.filename)
 class SynthVariant(object):
    """docstring for SynthVariant."""
-    def __init__(self, identifier, rate):
+    def __init__(self, identifier, voice, lang, rate):
        super(SynthVariant, self).__init__()
        self.synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
        self.synth.setVolume_(100)
@@ -75,10 +96,18 @@ class SynthVariant(object):
            NSSpeechModePhoneme, NSSpeechInputModeProperty, None)
        self.identifier = identifier
        self.rate = rate
-        self.name = identifier.split('.')[-1]
+        self.name = voice
        self.lang = lang
        self.phoneme_capable = self.is_phoneme_capable()
        if self.phoneme_capable:
            create_dir(dest_dir + self.name + '/' + str(self.rate))
    def __repr__(self):
-        return 'Synthesizer[{} - {}]({})'.format(self.name, self.rate)
+        return 'Synthesizer[{} - {}]'.format(self.name, self.rate)
    def is_phoneme_capable(self):
        orig_phon = self.synth.phonemesFromText_('water')
        return orig_phon != ''
    def generate_audio(self, word, variant):
        orig_phon, phoneme, phon_cmd = self.synth.phonemesFromText_(
@@ -91,17 +120,17 @@ class SynthVariant(object):
            phon_cmd = '[[inpt PHON]] ' + phoneme
        elif variant == 'high':
            phoneme = orig_phon
-            phon_cmd = word
+            phon_cmd = '[[inpt PHON]] ' + phoneme
        # elif variant == 'long':
        # if phon != '':
        # self.phone_synth.startSpeakingString_toURL_(phon,d_url)
        # else:
        #     self.synth.startSpeakingString_toURL_(word,d_url)
-        fname = dest_filename(word, phoneme, self.name, self.rate)
+        fname = dest_filename(word, self.name, self.rate, variant)
-        d_path = dest_path(self.name, self.rate, fname)
+        d_path, r_path = dest_path(self.name, self.rate, fname)
        # d_url = NSURL.fileURLWithPath_(d_path)
        cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
-        return SynthFile(word, phoneme, fname, self.name, self.rate, variant)
+        return SynthFile(word, phoneme, r_path, self.name, self.lang, self.rate, variant)
 def synth_generator():
@@ -109,11 +138,19 @@ def synth_generator():
    voice_attrs = [
        NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed
    ]
    # sk = [k for k in voice_attrs[0].keys() if k not in [
    #     'VoiceIndividuallySpokenCharacters', 'VoiceSupportedCharacters']]
    # s_attrs = [[v[i] for i in sk] for v in voice_attrs if 'VoiceShowInFullListOnly' in v
    #            and 'VoiceRelativeDesirability' in v]
    us_voices_ids = [
-        v['VoiceIdentifier'] for v in voice_attrs
+        (v['VoiceIdentifier'],
         v['VoiceName'],
         v['VoiceLanguage']) for v in voice_attrs
        if v['VoiceLanguage'] == 'en-US'
-        and v['VoiceIdentifier'].split('.')[-1][0].isupper()
+        and v['VoiceGender'] != 'VoiceGenderNeuter'
    ]
    # import pdb
    # pdb.set_trace()
    # us_voices_ids = ['com.apple.speech.synthesis.voice.Fred',
    #                  'com.apple.speech.synthesis.voice.Alex',
    #                  'com.apple.speech.synthesis.voice.Victoria']
@@ -121,19 +158,24 @@ def synth_generator():
    voice_rates = [150, 180, 210, 250]
    voice_synths = []
    create_dir(dest_dir)
-    for v in us_voices_ids:
+    for (i, v, l) in us_voices_ids:
        for r in voice_rates:
-            create_dir(dest_dir + v + '/' + r)
+            s = SynthVariant(i, v, l, r)
-            voice_synths.append(SynthVariant(v, r))
+            if s.phoneme_capable:
                print('Adding ', s)
                voice_synths.append(s)
            else:
                print('Discarding phoneme incapable ', s)
-    def synth_for_words(words):
+    def synth_for_words(words, writer):
-        all_synths = []
+        prog_title = "Synthesizing {} words : ".format(len(words))
-        for w in words:
+        (update, prog) = prog_bar(prog_title)
        for w in prog(words):
            for s in voice_synths:
                for v in ['low', 'medium', 'high']:
-                    all_synths.append(s.generate_audio(w, v))
+                    update('"{}" with {} variant ({})'.format(w, s, v))
-        return all_synths
+                    synthed = s.generate_audio(w, v)
-
+                    writer(synthed)
    return synth_for_words
@@ -147,25 +189,50 @@ def write_synths(synth_list, fname, csv=False):
    f.close()
 def synth_logger(fname, csv=False):
    f = open(fname, 'w')
    def csv_writer(s):
        f.write(s.get_csv())
    synth_list = []
    def json_writer(s):
        synth_list.append(s)
    def close_file():
        if csv:
            f.close()
        else:
            json.dump([s.get_json() for s in synth_list], f)
            f.close()
    if csv:
        return csv_writer, close_file
    else:
        return json_writer, close_file
 def generate_audio_for_stories():
-    stories_data = json.load(open('./inputs/all_stories_hs.json'))
+    # story_file = './inputs/all_stories_hs.json'
-    word_list = [t[0] for i in stories_data.values() for t in i]
+    story_file = './inputs/all_stories.json'
-    words_audio_synth = synth_generator()
+    stories_data = json.load(open(story_file))
-    return words_audio_synth(word_list)
+    # word_list = [t[0] for i in stories_data.values() for t in i]
    word_list = [i for g in stories_data.values() for i in g]
    (writer, closer) = synth_logger(dest_file, csv=True)
    synth_for_words = synth_generator()
    try:
        synth_for_words(word_list, writer)
    except:
        import traceback
        import sys
        traceback.print_exc(file=sys.stdout)
        pass
    closer()
 # synths = synth_generator()([OUTPUT_NAME])
-# words_audio_synth = synth_generator()
+# write_synths(synths, dest_file, True)
 # synth = NSSpeechSynthesizer.alloc().init()
 # voices_installed = NSSpeechSynthesizer.availableVoices()
 # voice_attrs = [NSSpeechSynthesizer.attributesForVoice_(v) for v in  voices_installed]
 # us_voices_ids = [v['VoiceIdentifier'] for v in voice_attrs if v['VoiceLanguage'] == 'en-US' and v['VoiceIdentifier'].split('.')[-1][0].isupper()]
 # synth.setVoice_(us_voices_ids[2])
 # synth.startSpeakingString_('your')
 # fname = dest_filename(word,self.name,self.rate,self.operation)
 # d_path = dest_path(fname)
 # d_url = dest_url(d_path)
 synths = synth_generator()([OUTPUT_NAME])
 # synths = generate_audio_for_stories()
 write_synths(synths, dest_file, True)
 # write_synths(synths,'./outputs/synths.json')
 if __name__ == '__main__':
    generate_audio_for_stories()
Author	SHA1	Message	Date
Malar Kannan	eb3ce8b7e5	wip high variant phoneme	2017-10-26 18:06:14 +05:30
Malar Kannan	e57576d6fa	discarding phoneme incapable synthesizers	2017-10-26 16:51:32 +05:30
Malar Kannan	a953fa3355	fixed progress	2017-10-26 16:18:17 +05:30
Malar Kannan	7a520b79f4	writing to csv proactively	2017-10-26 15:58:25 +05:30
Malar Kannan	05f36daf7e	refactored sample generation code	2017-10-26 15:27:22 +05:30