# speech-scoring/tts_samplegen.py

import objc
from AppKit import NSSpeechSynthesizer, NSSpeechInputModeProperty
from AppKit import NSSpeechModePhoneme
from Foundation import NSURL
import json
import random
import os
import re
import subprocess
import progressbar

# Base name shared by the audio output directory and the metadata CSV.
OUTPUT_NAME = 'story_sents'
# Absolute directory (resolved from the working directory at import time)
# under which the generated audio files are placed; ends with '/'.
dest_dir = ''.join([os.path.abspath('.'), '/outputs/', OUTPUT_NAME, '/'])
# Path, relative to the working directory, of the metadata CSV.
dest_file = ''.join(['./outputs/', OUTPUT_NAME, '.csv'])


def prog_bar(title):
    """Build a progress bar and a callback that relabels the current entry.

    Args:
        title: Text placed at the start of the bar's widget list.

    Returns:
        A ``(update_prog, prog)`` tuple. ``update_prog(current)`` replaces
        the label widget with one built from ``current`` and then calls
        ``prog.update()``; ``prog`` is the ``progressbar.ProgressBar``.
    """
    label_slot = 3  # position of the FormatLabel placeholder in ``widgets``
    widgets = [
        title,
        progressbar.Counter(),
        'th entry - ',
        progressbar.FormatLabel(''),
        ' [',
        progressbar.Bar(),
        '] - ',
        progressbar.ETA(),
    ]
    prog = progressbar.ProgressBar(widgets=widgets)

    def update_prog(current):
        # Swap the label inside the same list object that was handed to the
        # ProgressBar constructor, then ask the bar to refresh.
        widgets[label_slot] = progressbar.FormatLabel(current)
        prog.update()

    return update_prog, prog


def create_dir(direc):
    """Create directory ``direc`` (and any missing parents) if needed.

    Calling this for a directory that already exists is a no-op. The
    creation is a single ``os.makedirs`` call rather than an
    exists-then-create sequence, so a directory appearing between the check
    and the creation can no longer make it fail.

    Args:
        direc: Path of the directory to create.

    Raises:
        OSError: If the directory cannot be created, including when
            ``direc`` already exists but is not a directory.
    """
    os.makedirs(direc, exist_ok=True)


def dest_filename(w, v, r, t):
    """Return an ``.aiff`` file name built from the four parts plus a random tag.

    Args:
        w: First component of the name (callers pass the spoken text).
        v: Second component (callers pass the voice name).
        r: Third component (callers pass the speaking rate).
        t: Fourth component (callers pass the variant label).

    Returns:
        ``'<w>-<v>-<r>-<t>-<n>.aiff'`` where ``n`` is a random integer in
        ``[0, 10000]``.
    """
    random_tag = str(random.randint(0, 10000))
    template = '{}-{}-{}-{}-{}.aiff'
    return template.format(w, v, r, t, random_tag)


def dest_path(v, r, n):
    """Return the full and relative output paths for one audio file.

    Args:
        v: Voice name; first path component.
        r: Speaking rate; converted with ``str`` for the second component.
        n: File name; last path component.

    Returns:
        A ``(full, rel)`` tuple where ``rel`` is ``'<v>/<r>/<n>'`` and
        ``full`` is the module-level ``dest_dir`` followed by ``rel``.
    """
    rel = '/'.join((v, str(r), n))
    return dest_dir + rel, rel


def cli_gen_audio(speech_cmd, rate, voice, out_path):
    """Invoke the ``say`` command-line tool to render speech into a file.

    Args:
        speech_cmd: Text handed to ``say`` as its final argument.
        rate: Value passed to ``-r`` (converted with ``str``).
        voice: Value passed to ``-v``.
        out_path: Value passed to ``-o`` (the output file).

    The exit status of ``say`` is not inspected and nothing is returned.
    """
    command = ['say', '-v', voice, '-r', str(rate), '-o', out_path, speech_cmd]
    subprocess.call(command)


class SynthFile(object):
    """Metadata record describing one synthesized audio file."""

    def __init__(self, word, phon, filename, voice, rate, operation):
        """Store the fields describing a synthesized sample.

        Args:
            word: The text that was synthesized.
            phon: Phoneme string associated with the text.
            filename: Path of the generated audio file.
            voice: Name of the voice used.
            rate: Speaking rate used.
            operation: Variant label; stored as ``self.variant``.
        """
        super(SynthFile, self).__init__()
        self.word = word
        self.phoneme = phon
        self.filename = filename
        self.voice = voice
        self.rate = rate
        self.variant = operation

    def get_json(self):
        """Return a JSON-serializable dict for this record.

        The variant label is exposed under the ``'operation'`` key. It is
        read from ``self.variant``, the only attribute ``__init__`` sets for
        it (reading ``self.operation`` raised ``AttributeError``).
        """
        return {
            'filename': self.filename,
            'voice': self.voice,
            'rate': self.rate,
            'operation': self.variant,
        }

    def get_csv(self):
        """Return this record as one newline-terminated CSV line.

        Columns: word, phoneme, voice, rate, variant, filename. Fields that
        contain a comma, a double quote or a line break are quoted by the
        ``csv`` module so they cannot split or merge columns; rows without
        such characters are unchanged from plain comma-joining.
        """
        # Imported locally: sibling functions in this file use ``csv`` as a
        # parameter name, and the module is only needed here.
        import csv
        import io

        fields = (self.word, self.phoneme, self.voice,
                  self.rate, self.variant, self.filename)
        buf = io.StringIO()
        # Pre-format every field with '{}' so values render exactly as they
        # did with str.format (e.g. None -> 'None').
        csv.writer(buf, lineterminator='\n').writerow(
            ['{}'.format(field) for field in fields])
        return buf.getvalue()


class SynthVariant(object):
    """One (voice, speaking rate) combination used to generate samples.

    Two ``NSSpeechSynthesizer`` objects are created for the voice:
    ``synth``, which ``generate_audio`` uses to obtain the phoneme string of
    a text, and ``phone_synth``, which is switched to phoneme input mode
    (``generate_audio`` does not currently use it). The audio itself is
    rendered through ``cli_gen_audio``.
    """

    def __init__(self, identifier, voice, lang, rate):
        """Configure the synthesizers and create the output directory.

        Args:
            identifier: Voice identifier passed to ``initWithVoice_``.
            voice: Voice name; used for ``say -v`` and in output paths.
            lang: Voice language; stored as ``self.lang``.
            rate: Speaking rate; applied to both synthesizers and used for
                ``say -r`` and in output paths.
        """
        super(SynthVariant, self).__init__()
        self.synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
        # NSSpeechSynthesizer volume is documented as a float in 0.0-1.0;
        # 1.0 is the maximum (the previous value of 100 was out of range).
        self.synth.setVolume_(1.0)
        self.synth.setRate_(rate)
        self.phone_synth = NSSpeechSynthesizer.alloc().initWithVoice_(
            identifier)
        self.phone_synth.setVolume_(1.0)
        self.phone_synth.setRate_(rate)
        # Set the input-mode property of this synthesizer to phoneme mode.
        self.phone_synth.setObject_forProperty_error_(
            NSSpeechModePhoneme, NSSpeechInputModeProperty, None)
        self.identifier = identifier
        self.rate = rate
        self.name = voice
        self.lang = lang
        create_dir(dest_dir + self.name + '/' + str(self.rate))

    def __repr__(self):
        return 'Synthesizer[{} - {}]'.format(self.name, self.rate)

    def generate_audio(self, word, variant):
        """Render ``word`` to an audio file and return its metadata.

        Args:
            word: Text to synthesize.
            variant: ``'low'``, ``'medium'`` or ``'high'``.
                * ``'medium'`` removes all digits from the phoneme string
                  and speaks that string prefixed with ``'[[inpt PHON]] '``
                  (presumably switching ``say`` to phoneme input - confirm).
                * ``'low'`` and ``'high'`` both speak the plain text and
                  record the unmodified phoneme string; they currently
                  differ only in the label.
                * Any other value speaks the plain text and records an
                  empty phoneme string.

        Returns:
            A ``SynthFile`` whose ``filename`` is the path relative to
            ``dest_dir``.
        """
        orig_phon = self.synth.phonemesFromText_(word)
        if variant == 'medium':
            phoneme = re.sub('[0-9]', '', orig_phon)
            phon_cmd = '[[inpt PHON]] ' + phoneme
        elif variant in ('low', 'high'):
            phoneme, phon_cmd = orig_phon, word
        else:
            phoneme, phon_cmd = '', word
        fname = dest_filename(word, self.name, self.rate, variant)
        d_path, r_path = dest_path(self.name, self.rate, fname)
        cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
        return SynthFile(word, phoneme, r_path, self.name, self.rate, variant)


def synth_generator():
    """Create a ``SynthVariant`` per voice/rate and return a batch synthesizer.

    Every installed voice whose ``'VoiceLanguage'`` attribute equals
    ``'en-US'`` and whose ``'VoiceGender'`` is not ``'VoiceGenderNeuter'`` is
    combined with each of the rates 150, 180, 210 and 250. ``dest_dir`` is
    created before the variants are built, and each variant is printed as it
    is created.

    Returns:
        ``synth_for_words(words, writer)``: for every item of ``words``,
        every variant and every one of ``'low'``, ``'medium'``, ``'high'``,
        it calls ``generate_audio`` and passes the result to ``writer``.
    """
    attrs_by_voice = [
        NSSpeechSynthesizer.attributesForVoice_(voice_id)
        for voice_id in NSSpeechSynthesizer.availableVoices()
    ]

    # (identifier, name, language) for each voice that passes the filter.
    selected_voices = []
    for attrs in attrs_by_voice:
        if (attrs['VoiceLanguage'] == 'en-US'
                and attrs['VoiceGender'] != 'VoiceGenderNeuter'):
            selected_voices.append((attrs['VoiceIdentifier'],
                                    attrs['VoiceName'],
                                    attrs['VoiceLanguage']))

    voice_rates = [150, 180, 210, 250]
    voice_synths = []
    create_dir(dest_dir)
    for identifier, name, language in selected_voices:
        for rate in voice_rates:
            variant_synth = SynthVariant(identifier, name, language, rate)
            print('Created ', variant_synth)
            voice_synths.append(variant_synth)

    def synth_for_words(words, writer):
        """Synthesize every word with every variant and hand results to ``writer``."""
        prog_title = "Synthesizing {} words : ".format(len(words))
        update, prog = prog_bar(prog_title)
        variant_labels = ('low', 'medium', 'high')
        for word in prog(words):
            for variant_synth in voice_synths:
                for label in variant_labels:
                    update('"{}" with {} variant ({})'.format(
                        word, variant_synth, label))
                    writer(variant_synth.generate_audio(word, label))

    return synth_for_words


def write_synths(synth_list, fname, csv=False):
    """Write all records in ``synth_list`` to ``fname``.

    Args:
        synth_list: Iterable of objects providing ``get_csv()`` and
            ``get_json()``.
        fname: Output file path; opened in ``'w'`` mode (truncated).
        csv: When true, write the concatenated ``get_csv()`` lines;
            otherwise dump a JSON list of the ``get_json()`` dicts.

    The file is opened in a ``with`` block so it is closed even if
    serializing one of the records raises.
    """
    with open(fname, 'w') as f:
        if csv:
            f.writelines(s.get_csv() for s in synth_list)
        else:
            json.dump([s.get_json() for s in synth_list], f)


def synth_logger(fname, csv=False):
    """Open ``fname`` for writing and return ``(writer, close_file)``.

    Args:
        fname: Output file path; opened immediately in ``'w'`` mode.
        csv: Selects the logging mode.
            * True: ``writer(s)`` writes ``s.get_csv()`` to the file at once.
            * False: ``writer(s)`` only collects ``s``; the JSON list of
              ``get_json()`` dicts is written when ``close_file()`` runs.

    Returns:
        A ``(writer, close_file)`` tuple of callables. ``close_file()``
        always closes the file, even if the JSON dump raises.
    """
    f = open(fname, 'w')
    synth_list = []

    def csv_writer(s):
        f.write(s.get_csv())

    def json_writer(s):
        synth_list.append(s)

    def close_file():
        try:
            if not csv:
                json.dump([s.get_json() for s in synth_list], f)
        finally:
            # Release the handle whether or not serialization succeeded.
            f.close()

    if csv:
        return csv_writer, close_file
    return json_writer, close_file


def generate_audio_for_stories():
    """Synthesize audio for every entry in the stories JSON and log to CSV.

    Loads ``./inputs/all_stories.json`` (a mapping whose values are
    iterables of items to speak), flattens all values into one list, and
    runs the function returned by ``synth_generator()`` over it, writing one
    CSV line per generated file to ``dest_file``.

    An ordinary exception raised during synthesis is reported with a
    traceback on stdout and otherwise ignored (best effort), so rows written
    so far are kept. The log file is closed in all cases, including
    ``KeyboardInterrupt``, which is no longer swallowed.
    """
    import sys
    import traceback

    # Alternative input used previously: './inputs/all_stories_hs.json'
    story_file = './inputs/all_stories.json'
    with open(story_file) as story_fh:
        stories_data = json.load(story_fh)
    word_list = [i for g in stories_data.values() for i in g]
    (writer, closer) = synth_logger(dest_file, csv=True)
    try:
        # Built inside the try block so the log file is closed even if
        # setting up the synthesizers fails; that failure still propagates.
        synth_for_words = synth_generator()
        try:
            synth_for_words(word_list, writer)
        except Exception:
            traceback.print_exc(file=sys.stdout)
    finally:
        closer()

# words_audio_synth = synth_generator()
# synth = NSSpeechSynthesizer.alloc().init()
# voices_installed = NSSpeechSynthesizer.availableVoices()
# voice_attrs = [NSSpeechSynthesizer.attributesForVoice_(v) for v in  voices_installed]
# us_voices_ids = [v['VoiceIdentifier'] for v in voice_attrs if v['VoiceLanguage'] == 'en-US' and v['VoiceIdentifier'].split('.')[-1][0].isupper()]
# synth.setVoice_(us_voices_ids[2])
# synth.startSpeakingString_('your')
# fname = dest_filename(word,self.name,self.rate,self.operation)
# d_path = dest_path(fname)
# d_url = dest_url(d_path)


# Script entry point. There is no ``if __name__ == '__main__'`` guard, so the
# whole generation run starts whenever this module is executed or imported.
# The commented-out lines are an earlier flow that synthesized a fixed list
# and wrote the results afterwards with write_synths.
# synths = synth_generator()([OUTPUT_NAME])
generate_audio_for_stories()
# write_synths(synths, dest_file, True)
# write_synths(synths,'./outputs/synths.json')