speech-scoring/tts_samplegen.py

import objc
from AppKit import NSSpeechSynthesizer, NSSpeechInputModeProperty
from AppKit import NSSpeechModePhoneme
from Foundation import NSURL
import json
import random
import os
import re
import subprocess
import progressbar

# Base name shared by the output audio directory and the output CSV manifest.
OUTPUT_NAME = 'story_sents'
# Absolute directory (with trailing slash) under which per-voice/per-rate
# audio folders are created, resolved against the current working directory.
dest_dir = ''.join([os.path.abspath(os.curdir), '/outputs/', OUTPUT_NAME, '/'])
# Relative path of the CSV manifest written at the end of the script.
dest_file = ''.join(['./outputs/', OUTPUT_NAME, '.csv'])


def prog_bar(title):
    """Build a progress bar and a callback that refreshes its label.

    Args:
        title: Text shown in the leading label widget of the bar.

    Returns:
        A ``(update_prog, prog)`` tuple: ``prog`` is the
        ``progressbar.ProgressBar`` instance and ``update_prog(current)``
        replaces the first widget with ``"<title> : <current>"`` and then
        calls ``prog.update()``.
    """
    label = progressbar.FormatLabel(title)
    widgets = [label, ' [', progressbar.Bar(), '] - ', progressbar.ETA()]
    prog = progressbar.ProgressBar(widgets=widgets)

    def update_prog(current):
        # Swap the label in the same list object that was handed to the bar.
        text = '{} : {}'.format(title, current)
        widgets[0] = progressbar.FormatLabel(text)
        prog.update()

    return update_prog, prog


def create_dir(direc):
    """Create ``direc`` (including missing parents) unless the path exists.

    Args:
        direc: Path of the directory to create.
    """
    if os.path.exists(direc):
        # Something is already at this path; nothing to do.
        return
    os.makedirs(direc)


def dest_filename(w, v, r, t):
    """Return an ``.aiff`` file name for one synthesized sample.

    The name is the four arguments followed by a random integer in
    ``[0, 10000]``, all joined with ``-``.

    Args:
        w: First name component (callers in this file pass the spoken text).
        v: Second name component (callers pass the voice name).
        r: Third name component (callers pass the speech rate).
        t: Fourth name component (callers pass the variant label).
    """
    suffix = random.randint(0, 10000)
    return '{}-{}-{}-{}-{}.aiff'.format(w, v, r, t, suffix)


def dest_path(v, r, n):
    """Return the output path ``<dest_dir><v>/<r>/<n>`` for an audio file.

    Args:
        v: Sub-directory name directly under ``dest_dir`` (callers pass the
            voice name).
        r: Second-level sub-directory; converted with ``str`` (callers pass
            the speech rate).
        n: File name placed inside that directory.
    """
    parts = (dest_dir, v, '/', str(r), '/', n)
    return ''.join(parts)


def cli_gen_audio(speech_cmd, rate, voice, out_path):
    """Run the ``say`` command line tool to write speech to a file.

    Invokes ``say -v <voice> -r <rate> -o <out_path> <speech_cmd>`` and
    waits for it to finish. The exit status is discarded.

    Args:
        speech_cmd: Text handed to ``say`` as its final argument.
        rate: Value for the ``-r`` option; converted with ``str``.
        voice: Value for the ``-v`` option.
        out_path: Value for the ``-o`` option (output audio file).
    """
    command = ['say', '-v', voice, '-r', str(rate), '-o', out_path, speech_cmd]
    subprocess.call(command)


class SynthFile(object):
    """Metadata record describing one synthesized audio file.

    Attributes are plain values copied from the constructor arguments:
    ``word``, ``phoneme``, ``filename``, ``voice``, ``rate`` and ``variant``.
    """

    def __init__(self, word, phon, filename, voice, rate, operation):
        """Store the metadata of a synthesized sample.

        Args:
            word: Text that was synthesized.
            phon: Phoneme string associated with the sample.
            filename: Name of the generated audio file.
            voice: Name of the voice used.
            rate: Speech rate used.
            operation: Variant label; stored as ``self.variant``.
        """
        super(SynthFile, self).__init__()
        self.word = word
        self.phoneme = phon
        self.filename = filename
        self.voice = voice
        self.rate = rate
        # The argument is named ``operation`` but the attribute is ``variant``.
        self.variant = operation

    def get_json(self):
        """Return a JSON-serializable dict describing this sample.

        The variant is exposed under the ``'operation'`` key. It is read from
        ``self.variant`` because no ``operation`` attribute is ever set.
        """
        return {
            'filename': self.filename,
            'voice': self.voice,
            'rate': self.rate,
            'operation': self.variant
        }

    def get_csv(self):
        """Return one newline-terminated CSV row for this sample.

        Columns: word, phoneme, voice, rate, variant, filename. The format
        string has one placeholder per value so the file name is emitted
        rather than silently dropped.

        NOTE: values are joined with plain commas and are not quoted, so a
        field that itself contains a comma produces extra columns.
        """
        return '{},{},{},{},{},{}\n'.format(self.word, self.phoneme,
                                            self.voice, self.rate,
                                            self.variant, self.filename)


class SynthVariant(object):
    """A speech synthesizer bound to one voice and one speech rate.

    Holds two ``NSSpeechSynthesizer`` objects for the voice (a plain one and
    one switched to phoneme input mode) and makes sure the output directory
    ``<dest_dir><voice>/<rate>`` exists.
    """

    @staticmethod
    def _new_synth(identifier, rate):
        """Allocate a synthesizer for ``identifier`` with volume and rate set."""
        synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
        # NOTE(review): Apple documents the volume range as 0.0-1.0; confirm
        # that 100 is intended here.
        synth.setVolume_(100)
        synth.setRate_(rate)
        return synth

    def __init__(self, identifier, voice, lang, rate):
        """Create the synthesizers and the output directory.

        Args:
            identifier: Voice identifier passed to ``initWithVoice_``.
            voice: Voice name; used for output paths and ``say -v``.
            lang: Voice language, stored as ``self.lang``.
            rate: Speech rate; applied to both synthesizers and used for
                output paths and ``say -r``.
        """
        super(SynthVariant, self).__init__()
        self.synth = self._new_synth(identifier, rate)
        # Second synthesizer configured to accept phoneme input.
        self.phone_synth = self._new_synth(identifier, rate)
        self.phone_synth.setObject_forProperty_error_(
            NSSpeechModePhoneme, NSSpeechInputModeProperty, None)
        self.identifier = identifier
        self.name = voice
        self.lang = lang
        self.rate = rate
        create_dir(dest_dir + self.name + '/' + str(self.rate))

    def __repr__(self):
        return 'Synthesizer[{} - {}]'.format(self.name, self.rate)

    def generate_audio(self, word, variant):
        """Synthesize ``word`` to an audio file and return its metadata.

        Variants:
            ``'low'`` / ``'high'``: speak ``word`` as text; the recorded
                phoneme string is the synthesizer's phonemes for ``word``.
            ``'medium'``: strip all digits from the phoneme string and speak
                it via ``[[inpt PHON]]`` phoneme input.
            anything else: speak ``word`` as text with an empty phoneme.

        Args:
            word: Text to synthesize.
            variant: Variant label selecting the behavior above.

        Returns:
            A ``SynthFile`` describing the generated file.
        """
        orig_phon = self.synth.phonemesFromText_(word)
        if variant == 'medium':
            phoneme = re.sub('[0-9]', '', orig_phon)
            phon_cmd = '[[inpt PHON]] ' + phoneme
        elif variant in ('low', 'high'):
            phoneme, phon_cmd = orig_phon, word
        else:
            phoneme, phon_cmd = '', word

        fname = dest_filename(word, self.name, self.rate, variant)
        d_path = dest_path(self.name, self.rate, fname)
        # Audio is rendered through the `say` CLI rather than
        # startSpeakingString_toURL_ on the synthesizer objects.
        cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
        return SynthFile(word, phoneme, fname, self.name, self.rate, variant)


def synth_generator():
    """Prepare synthesizers for installed en-US voices and return a generator.

    Every installed voice whose ``VoiceLanguage`` is ``'en-US'`` and whose
    ``VoiceGender`` is not ``'VoiceGenderNeuter'`` is combined with each rate
    in ``[150, 180, 210, 250]`` into a ``SynthVariant``. The output root
    ``dest_dir`` is created before the synthesizers are built.

    Returns:
        A function ``synth_for_words(words)`` that synthesizes every word
        with every prepared synthesizer in the ``'low'``, ``'medium'`` and
        ``'high'`` variants and returns the list of results.
    """
    voice_attrs = []
    for installed in NSSpeechSynthesizer.availableVoices():
        voice_attrs.append(NSSpeechSynthesizer.attributesForVoice_(installed))

    us_voices_ids = []
    for attrs in voice_attrs:
        if attrs['VoiceLanguage'] != 'en-US':
            continue
        if attrs['VoiceGender'] == 'VoiceGenderNeuter':
            continue
        us_voices_ids.append((attrs['VoiceIdentifier'],
                              attrs['VoiceName'],
                              attrs['VoiceLanguage']))

    voice_rates = [150, 180, 210, 250]
    create_dir(dest_dir)

    voice_synths = []
    for identifier, name, lang in us_voices_ids:
        for rate in voice_rates:
            synth = SynthVariant(identifier, name, lang, rate)
            print('Created ', synth)
            voice_synths.append(synth)

    def synth_for_words(words):
        """Synthesize each word with every synthesizer and variant."""
        prog_title = "Synthesizing {} words, current word".format(len(words))
        update, prog = prog_bar(prog_title)
        variants = ('low', 'medium', 'high')
        all_synths = []
        for word in prog(words):
            for synth in voice_synths:
                for variant in variants:
                    update('"{}" with {} variant ({})'.format(
                        word, synth, variant))
                    all_synths.append(synth.generate_audio(word, variant))
        return all_synths

    return synth_for_words


def write_synths(synth_list, fname, csv=False):
    """Write the metadata of synthesized samples to ``fname``.

    Args:
        synth_list: Iterable of objects providing ``get_csv()`` and
            ``get_json()``.
        fname: Path of the file to (over)write.
        csv: When true, write one ``get_csv()`` row per item; otherwise dump
            a JSON list of the ``get_json()`` dicts.
    """
    # The context manager closes the file even if serializing an item raises.
    with open(fname, 'w') as f:
        if csv:
            for s in synth_list:
                f.write(s.get_csv())
        else:
            json.dump([s.get_json() for s in synth_list], f)


def generate_audio_for_stories():
    """Synthesize audio for every entry in ``./inputs/all_stories.json``.

    The JSON file is loaded as a mapping; the items of all its values are
    flattened into a single list, which is then passed to the function
    returned by ``synth_generator()``.

    Returns:
        The list produced by that function (one entry per generated file).
    """
    # story_file = './inputs/all_stories_hs.json'
    story_file = './inputs/all_stories.json'
    # Close the input file deterministically instead of leaking the handle.
    with open(story_file) as story_fp:
        stories_data = json.load(story_fp)
    # Flatten: every item of every value in the mapping, in iteration order.
    word_list = [i for g in stories_data.values() for i in g]
    words_audio_synth = synth_generator()
    return words_audio_synth(word_list)

# Disabled scratch code: earlier experiment that drove NSSpeechSynthesizer
# directly (pick a voice, speak a string) instead of using synth_generator().
# words_audio_synth = synth_generator()
# synth = NSSpeechSynthesizer.alloc().init()
# voices_installed = NSSpeechSynthesizer.availableVoices()
# voice_attrs = [NSSpeechSynthesizer.attributesForVoice_(v) for v in  voices_installed]
# us_voices_ids = [v['VoiceIdentifier'] for v in voice_attrs if v['VoiceLanguage'] == 'en-US' and v['VoiceIdentifier'].split('.')[-1][0].isupper()]
# synth.setVoice_(us_voices_ids[2])
# synth.startSpeakingString_('your')
# fname = dest_filename(word,self.name,self.rate,self.operation)
# d_path = dest_path(fname)
# d_url = dest_url(d_path)


# Script body: these statements run whenever the module is executed or
# imported (there is no ``__main__`` guard).
# Disabled alternative: synthesize only the OUTPUT_NAME string itself.
# synths = synth_generator()([OUTPUT_NAME])
synths = generate_audio_for_stories()
# Third argument True selects the CSV branch of write_synths.
write_synths(synths, dest_file, True)
# Disabled alternative: JSON output.
# write_synths(synths,'./outputs/synths.json')
generated voice files using ios api 2017-10-04 12:21:24 +00:00			`import objc`
formatted 2017-10-25 08:06:41 +00:00			`from AppKit import NSSpeechSynthesizer, NSSpeechInputModeProperty`
			`from AppKit import NSSpeechModePhoneme`
			`from Foundation import NSURL`
generated voice files using ios api 2017-10-04 12:21:24 +00:00			`import json`
implemented phoneme/voice/rate variant generation 2017-10-04 17:51:28 +00:00			`import random`
generated voice files using ios api 2017-10-04 12:21:24 +00:00			`import os`
implemented phoneme/voice/rate variant generation 2017-10-04 17:51:28 +00:00			`import re`
1. using cli say instead of api since api generates empty responses sometimes 2. generating all words voices for each variants 2017-10-05 05:32:38 +00:00			`import subprocess`
refactored sample generation code 2017-10-26 09:57:22 +00:00			`import progressbar`
generated voice files using ios api 2017-10-04 12:21:24 +00:00
refactored sample generation code 2017-10-26 09:57:22 +00:00			`OUTPUT_NAME = 'story_sents'`
formatted 2017-10-25 08:06:41 +00:00			`dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'`
			`dest_file = './outputs/' + OUTPUT_NAME + '.csv'`


refactored sample generation code 2017-10-26 09:57:22 +00:00			`def prog_bar(title):`
			`widgets = [progressbar.FormatLabel(`
			`title), ' [', progressbar.Bar(), '] - ', progressbar.ETA()]`
			`prog = progressbar.ProgressBar(widgets=widgets)`

			`def update_prog(current):`
			`widgets[0] = progressbar.FormatLabel('{} : {}'.format(title, current))`
			`prog.update()`
			`return (update_prog, prog)`


1. implemented spectrogram generator for audio files 2. imported siamese network class (wip) 3. added similarity measure based phoneme neighbor generator 4. fixed samplegen variants code 5. create triplets (wip) 6. updates 2017-10-13 11:10:57 +00:00			`def create_dir(direc):`
refactored 2017-10-05 11:24:41 +00:00			`if not os.path.exists(direc):`
refactored sample generation code 2017-10-26 09:57:22 +00:00			`os.makedirs(direc)`
generated voice files using ios api 2017-10-04 12:21:24 +00:00
formatted 2017-10-25 08:06:41 +00:00
refactored sample generation code 2017-10-26 09:57:22 +00:00			`def dest_filename(w, v, r, t):`
			`return '{}-{}-{}-{}-{}.aiff'.format(w, v, r, t, str(random.randint(0, 10000)))`
formatted 2017-10-25 08:06:41 +00:00

			`def dest_path(v, r, n):`
refactored sample generation code 2017-10-26 09:57:22 +00:00			`return dest_dir + v + '/' + str(r) + '/' + n`
formatted 2017-10-25 08:06:41 +00:00

			`def cli_gen_audio(speech_cmd, rate, voice, out_path):`
			`subprocess.call(`
			`['say', '-v', voice, '-r',`
			`str(rate), '-o', out_path, speech_cmd])`

1. using cli say instead of api since api generates empty responses sometimes 2. generating all words voices for each variants 2017-10-05 05:32:38 +00:00
implemented phoneme/voice/rate variant genration 2017-10-04 17:51:28 +00:00			`class SynthFile(object):`
			`"""docstring for SynthFile."""`
formatted 2017-10-25 08:06:41 +00:00
			`def __init__(self, word, phon, filename, voice, rate, operation):`
implemented phoneme/voice/rate variant genration 2017-10-04 17:51:28 +00:00			`super(SynthFile, self).__init__()`
			`self.word = word`
1. implemented spectrogram generator for audio files 2. imported siamese network class (wip) 3. added similarity measure based phoneme neighbor generator 4. fixed samplegen variants code 5. create triplets (wip) 6. updates 2017-10-13 11:10:57 +00:00			`self.phoneme = phon`
implemented phoneme/voice/rate variant genration 2017-10-04 17:51:28 +00:00			`self.filename = filename`
			`self.voice = voice`
			`self.rate = rate`
1. implemented spectrogram generator for audio files 2. imported siamese network class (wip) 3. added similarity measure based phoneme neighbor generator 4. fixed samplegen variants code 5. create triplets (wip) 6. updates 2017-10-13 11:10:57 +00:00			`self.variant = operation`
generated voice files using ios api 2017-10-04 12:21:24 +00:00
implemented phoneme/voice/rate variant genration 2017-10-04 17:51:28 +00:00			`def get_json(self):`
formatted 2017-10-25 08:06:41 +00:00			`return {`
			`'filename': self.filename,`
			`'voice': self.voice,`
			`'rate': self.rate,`
			`'operation': self.operation`
			`}`
implemented phoneme/voice/rate variant genration 2017-10-04 17:51:28 +00:00
			`def get_csv(self):`
formatted 2017-10-25 08:06:41 +00:00			`return '{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice,`
			`self.rate, self.variant,`
			`self.filename)`

implemented phoneme/voice/rate variant genration 2017-10-04 17:51:28 +00:00
			`class SynthVariant(object):`
			`"""docstring for SynthVariant."""`
formatted 2017-10-25 08:06:41 +00:00
refactored sample generation code 2017-10-26 09:57:22 +00:00			`def __init__(self, identifier, voice, lang, rate):`
implemented phoneme/voice/rate variant genration 2017-10-04 17:51:28 +00:00			`super(SynthVariant, self).__init__()`
1. using cli say instead of api since api generates empty responses sometimes 2. generating all words voices for each variants 2017-10-05 05:32:38 +00:00			`self.synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)`
			`self.synth.setVolume_(100)`
			`self.synth.setRate_(rate)`
formatted 2017-10-25 08:06:41 +00:00			`self.phone_synth = NSSpeechSynthesizer.alloc().initWithVoice_(`
			`identifier)`
1. using cli say instead of api since api generates empty responses sometimes 2. generating all words voices for each variants 2017-10-05 05:32:38 +00:00			`self.phone_synth.setVolume_(100)`
			`self.phone_synth.setRate_(rate)`
formatted 2017-10-25 08:06:41 +00:00			`self.phone_synth.setObject_forProperty_error_(`
			`NSSpeechModePhoneme, NSSpeechInputModeProperty, None)`
implemented phoneme/voice/rate variant genration 2017-10-04 17:51:28 +00:00			`self.identifier = identifier`
			`self.rate = rate`
refactored sample generation code 2017-10-26 09:57:22 +00:00			`self.name = voice`
			`self.lang = lang`
			`create_dir(dest_dir + self.name + '/' + str(self.rate))`
implemented phoneme/voice/rate variant genration 2017-10-04 17:51:28 +00:00
1. using cli say instead of api since api generates empty responses sometimes 2. generating all words voices for each variants 2017-10-05 05:32:38 +00:00			`def __repr__(self):`
refactored sample generation code 2017-10-26 09:57:22 +00:00			`return 'Synthesizer[{} - {}]'.format(self.name, self.rate)`
implemented phoneme/voice/rate variant genration 2017-10-04 17:51:28 +00:00
formatted 2017-10-25 08:06:41 +00:00			`def generate_audio(self, word, variant):`
			`orig_phon, phoneme, phon_cmd = self.synth.phonemesFromText_(`
			`word), '', word`
1. implemented spectrogram generator for audio files 2. imported siamese network class (wip) 3. added similarity measure based phoneme neighbor generator 4. fixed samplegen variants code 5. create triplets (wip) 6. updates 2017-10-13 11:10:57 +00:00			`if variant == 'low':`
			`# self.synth.startSpeakingString_toURL_(word,d_url)`
			`phoneme = orig_phon`
			`elif variant == 'medium':`
formatted 2017-10-25 08:06:41 +00:00			`phoneme = re.sub('[0-9]', '', orig_phon)`
			`phon_cmd = '[[inpt PHON]] ' + phoneme`
1. implemented spectrogram generator for audio files 2. imported siamese network class (wip) 3. added similarity measure based phoneme neighbor generator 4. fixed samplegen variants code 5. create triplets (wip) 6. updates 2017-10-13 11:10:57 +00:00			`elif variant == 'high':`
			`phoneme = orig_phon`
			`phon_cmd = word`
			`# elif variant == 'long':`
formatted 2017-10-25 08:06:41 +00:00			`# if phon != '':`
			`# self.phone_synth.startSpeakingString_toURL_(phon,d_url)`
			`# else:`
			`# self.synth.startSpeakingString_toURL_(word,d_url)`
refactored sample generation code 2017-10-26 09:57:22 +00:00			`fname = dest_filename(word, self.name, self.rate, variant)`
formatted 2017-10-25 08:06:41 +00:00			`d_path = dest_path(self.name, self.rate, fname)`
			`# d_url = NSURL.fileURLWithPath_(d_path)`
			`cli_gen_audio(phon_cmd, self.rate, self.name, d_path)`
			`return SynthFile(word, phoneme, fname, self.name, self.rate, variant)`
implemented phoneme/voice/rate variant genration 2017-10-04 17:51:28 +00:00

			`def synth_generator():`
			`voices_installed = NSSpeechSynthesizer.availableVoices()`
formatted 2017-10-25 08:06:41 +00:00			`voice_attrs = [`
			`NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed`
			`]`
refactored sample generation code 2017-10-26 09:57:22 +00:00			`# sk = [k for k in voice_attrs[0].keys() if k not in [`
			`# 'VoiceIndividuallySpokenCharacters', 'VoiceSupportedCharacters']]`
			`# s_attrs = [[v[i] for i in sk] for v in voice_attrs if 'VoiceShowInFullListOnly' in v`
			`# and 'VoiceRelativeDesirability' in v]`
formatted 2017-10-25 08:06:41 +00:00			`us_voices_ids = [`
refactored sample generation code 2017-10-26 09:57:22 +00:00			`(v['VoiceIdentifier'],`
			`v['VoiceName'],`
			`v['VoiceLanguage']) for v in voice_attrs`
			`# v['VoiceDemoText'],`
			`# v['VoiceShowInFullListOnly'],`
			`# v['VoiceRelativeDesirability'])`
formatted 2017-10-25 08:06:41 +00:00			`if v['VoiceLanguage'] == 'en-US'`
refactored sample generation code 2017-10-26 09:57:22 +00:00			`and v['VoiceGender'] != 'VoiceGenderNeuter'`
			`# and v['VoiceIdentifier'].split('.')[-1][0].isupper()`
			`# and 'VoiceShowInFullListOnly' in v`
			`# and 'VoiceRelativeDesirability' in v`
formatted 2017-10-25 08:06:41 +00:00			`]`
refactored sample generation code 2017-10-26 09:57:22 +00:00			`# import pdb`
			`# pdb.set_trace()`
formatted 2017-10-25 08:06:41 +00:00			`# us_voices_ids = ['com.apple.speech.synthesis.voice.Fred',`
			`# 'com.apple.speech.synthesis.voice.Alex',`
1. included arpabet apple phoneme mapper 2. using only voices with phoneme capability and 3 rates only 2017-10-05 08:28:00 +00:00			`# 'com.apple.speech.synthesis.voice.Victoria']`
			`# voice_rates = list(range(150,221,(220-180)//4))`
formatted 2017-10-25 08:06:41 +00:00			`voice_rates = [150, 180, 210, 250]`
implemented phoneme/voice/rate variant genration 2017-10-04 17:51:28 +00:00			`voice_synths = []`
1. implemented spectrogram generator for audio files 2. imported siamese network class (wip) 3. added similarity measure based phoneme neighbor generator 4. fixed samplegen variants code 5. create triplets (wip) 6. updates 2017-10-13 11:10:57 +00:00			`create_dir(dest_dir)`
refactored sample generation code 2017-10-26 09:57:22 +00:00			`for (i, v, l) in us_voices_ids:`
implemented phoneme/voice/rate variant genration 2017-10-04 17:51:28 +00:00			`for r in voice_rates:`
refactored sample generation code 2017-10-26 09:57:22 +00:00			`s = SynthVariant(i, v, l, r)`
			`print('Created ', s)`
			`voice_synths.append(s)`
formatted 2017-10-25 08:06:41 +00:00
1. using cli say instead of api since api generates empty responses sometimes 2. generating all words voices for each variants 2017-10-05 05:32:38 +00:00			`def synth_for_words(words):`
			`all_synths = []`
refactored sample generation code 2017-10-26 09:57:22 +00:00			`prog_title = "Synthesizing {} words, current word".format(len(words))`
			`(update, prog) = prog_bar(prog_title)`
			`for w in prog(words):`
1. included arpabet apple phoneme mapper 2. using only voices with phoneme capability and 3 rates only 2017-10-05 08:28:00 +00:00			`for s in voice_synths:`
formatted 2017-10-25 08:06:41 +00:00			`for v in ['low', 'medium', 'high']:`
refactored sample generation code 2017-10-26 09:57:22 +00:00			`update('"{}" with {} variant ({})'.format(w, s, v))`
formatted 2017-10-25 08:06:41 +00:00			`all_synths.append(s.generate_audio(w, v))`
1. using cli say instead of api since api generates empty responses sometimes 2. generating all words voices for each variants 2017-10-05 05:32:38 +00:00			`return all_synths`
formatted 2017-10-25 08:06:41 +00:00
1. using cli say instead of api since api generates empty responses sometimes 2. generating all words voices for each variants 2017-10-05 05:32:38 +00:00			`return synth_for_words`
implemented phoneme/voice/rate variant genration 2017-10-04 17:51:28 +00:00
formatted 2017-10-25 08:06:41 +00:00
			`def write_synths(synth_list, fname, csv=False):`
			`f = open(fname, 'w')`
implemented phoneme/voice/rate variant genration 2017-10-04 17:51:28 +00:00			`if csv:`
			`for s in synth_list:`
			`f.write(s.get_csv())`
			`else:`
formatted 2017-10-25 08:06:41 +00:00			`json.dump([s.get_json() for s in synth_list], f)`
implemented phoneme/voice/rate variant genration 2017-10-04 17:51:28 +00:00			`f.close()`
generated voice files using ios api 2017-10-04 12:21:24 +00:00
formatted 2017-10-25 08:06:41 +00:00
generated voice files using ios api 2017-10-04 12:21:24 +00:00			`def generate_audio_for_stories():`
refactored sample generation code 2017-10-26 09:57:22 +00:00			`# story_file = './inputs/all_stories_hs.json'`
			`story_file = './inputs/all_stories.json'`
			`stories_data = json.load(open(story_file))`
			`# word_list = [t[0] for i in stories_data.values() for t in i]`
			`word_list = [i for g in stories_data.values() for i in g]`
1. using cli say instead of api since api generates empty responses sometimes 2. generating all words voices for each variants 2017-10-05 05:32:38 +00:00			`words_audio_synth = synth_generator()`
			`return words_audio_synth(word_list)`
generated voice files using ios api 2017-10-04 12:21:24 +00:00
refactored 2017-10-05 11:24:41 +00:00			`# words_audio_synth = synth_generator()`
			`# synth = NSSpeechSynthesizer.alloc().init()`
			`# voices_installed = NSSpeechSynthesizer.availableVoices()`
			`# voice_attrs = [NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed]`
			`# us_voices_ids = [v['VoiceIdentifier'] for v in voice_attrs if v['VoiceLanguage'] == 'en-US' and v['VoiceIdentifier'].split('.')[-1][0].isupper()]`
			`# synth.setVoice_(us_voices_ids[2])`
			`# synth.startSpeakingString_('your')`
			`# fname = dest_filename(word,self.name,self.rate,self.operation)`
			`# d_path = dest_path(fname)`
			`# d_url = dest_url(d_path)`

refactored sample generation code 2017-10-26 09:57:22 +00:00
			`# synths = synth_generator()([OUTPUT_NAME])`
			`synths = generate_audio_for_stories()`
formatted 2017-10-25 08:06:41 +00:00			`write_synths(synths, dest_file, True)`
1. using cli say instead of api since api generates empty responses sometimes 2. generating all words voices for each variants 2017-10-05 05:32:38 +00:00			`# write_synths(synths,'./outputs/synths.json')`