# speech-scoring/tts-wav-gen.py

import objc
from AppKit import NSSpeechSynthesizer,NSSpeechInputModeProperty,NSSpeechModePhoneme
from Foundation import NSURL,NSError
import json
import random
import os
import re


def dest_filename(p):
    """Return a unique ``.aiff`` file name that starts with *p*.

    A UUID suffix is used instead of a small random integer: the same word
    is rendered many times (once per voice/rate/operation), and a suffix
    drawn from only ~10k values makes name collisions, and therefore
    silently overwritten audio files, likely.
    """
    import uuid  # local import: only this helper needs it

    return p + '_' + uuid.uuid4().hex + '.aiff'


def dest_path(p):
    """Return the absolute path of *p* inside ``outputs/audio`` under the current directory."""
    return os.path.join(os.path.abspath('.'), 'outputs', 'audio', p)


def dest_url(p):
    """Return an ``NSURL`` file URL pointing at ``dest_path(p)``."""
    return NSURL.fileURLWithPath_(dest_path(p))

class SynthFile(object):
    """Metadata record describing one synthesized audio file.

    The constructor stores its arguments unchanged as the attributes
    ``word``, ``filename``, ``voice``, ``rate`` and ``operation``.
    """

    def __init__(self, word, filename, voice, rate, operation):
        super(SynthFile, self).__init__()
        self.word = word
        self.filename = filename
        self.voice = voice
        self.rate = rate
        self.operation = operation

    def get_json(self):
        """Return a dict with the filename, voice, rate and operation.

        The word itself is not part of this dict.
        """
        return {'filename': self.filename, 'voice': self.voice,
                'rate': self.rate, 'operation': self.operation}

    def get_csv(self):
        """Return one newline-terminated CSV row.

        Column order is word, voice, rate, operation, filename. Fields that
        contain a comma, a double quote or a newline are quoted so the row
        still parses as five columns; plain fields are emitted unchanged.
        """
        import csv
        import io

        fields = (self.word, self.voice, self.rate, self.operation, self.filename)
        row = io.StringIO()
        # Format each field first so non-string values (including None)
        # are rendered exactly as str.format would render them.
        csv.writer(row, lineterminator='\n').writerow(
            ['{}'.format(field) for field in fields])
        return row.getvalue()

class SynthVariant(object):
    """One voice / rate / operation combination used to render words to audio.

    Holds two identically configured synthesizers: ``synth`` for plain text
    and ``phone_synth``, which is additionally switched to phoneme input mode.
    """

    @staticmethod
    def _configured_synth(identifier, rate):
        """Create a synthesizer with volume 100 and the given voice and rate."""
        speaker = NSSpeechSynthesizer.alloc().init()
        speaker.setVolume_(100)
        speaker.setVoice_(identifier)
        speaker.setRate_(rate)
        return speaker

    def __init__(self, identifier, rate, op):
        super(SynthVariant, self).__init__()
        self.synth = self._configured_synth(identifier, rate)

        phoneme_speaker = self._configured_synth(identifier, rate)
        # Make this synthesizer interpret its input as phonemes, not text.
        phoneme_speaker.setObject_forProperty_error_(
            NSSpeechModePhoneme, NSSpeechInputModeProperty, None)
        self.phone_synth = phoneme_speaker

        self.identifier = identifier
        self.rate = rate
        # Short voice name: the last dot-separated part of the identifier.
        self.name = identifier.split('.')[-1]
        self.operation = op

    def synth_file(self, word):
        """Render *word* to a new audio file and return its SynthFile record.

        For the 'normal' operation the word is spoken as text. For any other
        operation the word is converted to phonemes, all digits are removed
        from the phoneme string, and the result is spoken by the
        phoneme-mode synthesizer.
        """
        fname = dest_filename(word)
        target = dest_url(fname)
        # NOTE(review): startSpeakingString_toURL_ looks asynchronous --
        # confirm the audio is fully written before it is consumed.
        if self.operation != 'normal':
            # Digits presumably mark stress in the phoneme string -- confirm.
            phonemes = re.sub('[0-9]', '', self.synth.phonemesFromText_(word))
            self.phone_synth.startSpeakingString_toURL_(phonemes, target)
        else:
            self.synth.startSpeakingString_toURL_(word, target)
        return SynthFile(word, fname, self.name, self.rate, self.operation)


def synth_generator():
    """Build every synthesizer variant once and return a per-word renderer.

    Variants are the cross product of the installed voices whose
    'VoiceLanguage' is 'en-US', the rates 180..220 in steps of
    (220 - 180) // 5, and the operations 'normal' and 'phoneme'.

    Returns:
        A function taking a word and returning the list of results of
        ``synth_file(word)`` for every variant, in construction order.
    """
    attrs_by_voice = [NSSpeechSynthesizer.attributesForVoice_(voice)
                      for voice in NSSpeechSynthesizer.availableVoices()]
    us_ids = [attrs['VoiceIdentifier'] for attrs in attrs_by_voice
              if attrs['VoiceLanguage'] == 'en-US']
    rates = list(range(180, 221, (220 - 180) // 5))
    operations = ['normal', 'phoneme']

    # Voice is the outermost loop, then rate, then operation.
    all_variants = [SynthVariant(voice_id, rate, operation)
                    for voice_id in us_ids
                    for rate in rates
                    for operation in operations]

    def synth_for_word(word):
        rendered = []
        for variant in all_variants:
            rendered.append(variant.synth_file(word))
        return rendered

    return synth_for_word

def write_synths(synth_list, fname, csv=False):
    """Write the metadata of *synth_list* to the file *fname*.

    Args:
        synth_list: Iterable of objects providing ``get_csv()`` and
            ``get_json()``.
        fname: Path of the output file; it is opened in write mode, so any
            existing content is replaced.
        csv: When true, write each item's ``get_csv()`` row in order;
            otherwise dump a single JSON array of the ``get_json()`` dicts.
    """
    # The context manager closes the file even if an item fails to serialize.
    with open(fname, 'w') as out:
        if csv:
            out.writelines(s.get_csv() for s in synth_list)
        else:
            json.dump([s.get_json() for s in synth_list], out)

def generate_audio_for_stories():
    """Synthesize audio for every word in ``./inputs/all_stories_hs.json``.

    The JSON file is read as a mapping; each value is iterated and the first
    element of every entry is taken as a word. Words are processed in that
    order, duplicates included.

    Returns:
        A flat list with the results of the per-word synthesis function for
        all words.
    """
    # Close the input file deterministically instead of leaking the handle.
    with open('./inputs/all_stories_hs.json') as stories_file:
        stories_data = json.load(stories_file)
    word_list = [t[0] for i in stories_data.values() for t in i]
    word_audio_synth = synth_generator()
    all_synths = []
    for word in word_list:
        all_synths.extend(word_audio_synth(word))
    return all_synths

# Single-word smoke run, kept for reference:
# synths = synth_generator()('education')

# Render every story word, then store the metadata as both CSV and JSON.
synths = generate_audio_for_stories()
write_synths(synths, './outputs/synth_data.csv', csv=True)
write_synths(synths, './outputs/synths.json', csv=False)