# speech-scoring/tts_samplegen.py

import objc
from AppKit import NSSpeechSynthesizer,NSSpeechInputModeProperty,NSSpeechModePhoneme
from Foundation import NSURL,NSError,NSObject
import json
import random
import os
import re
import subprocess


# Base name shared by the audio output directory and the CSV manifest.
OUTPUT_NAME = 'audio'
# Absolute directory (with trailing slash) under which audio files are written.
dest_dir = ''.join([os.path.abspath('.'), '/outputs/', OUTPUT_NAME, '/'])
# Path, relative to the working directory, of the manifest written at the end.
dest_file = ''.join(['./outputs/', OUTPUT_NAME, '.csv'])
def create_dir(direc):
    """Create directory ``direc``, including missing parents, if it is absent.

    Calling this for a directory that already exists is a no-op.

    Args:
        direc: Path of the directory to create.

    Raises:
        OSError: If the directory could not be created and does not exist
            afterwards (e.g. permission denied, or a regular file occupies
            the path).
    """
    try:
        # makedirs (rather than mkdir) so nested paths such as
        # <dest_dir>/<voice>/<rate> work even when the parent is missing.
        os.makedirs(direc)
    except OSError:
        # An already-existing directory is fine; anything else is a real error.
        if not os.path.isdir(direc):
            raise
def dest_filename(n,v,r,t):
    """Build an ``.aiff`` file name from four fields plus a random suffix.

    The four arguments are formatted in order, separated by ``-``, followed
    by a random integer in ``[0, 10000]``. The suffix makes collisions
    unlikely but does not guarantee uniqueness.
    """
    return '{}-{}-{}-{}-'.format(n,v,r,t)+str(random.randint(0,10000))+'.aiff'


def dest_path(v,r,n):
    """Return ``<dest_dir><v>/<r>/<n>``, the full path of an output file.

    Args:
        v: Voice sub-directory name.
        r: Rate sub-directory name; converted with ``str`` so an integer
            rate is accepted as well as a string.
        n: File name.
    """
    return dest_dir+v+'/'+str(r)+'/'+n


def dest_url(p):
    """Wrap the filesystem path ``p`` in a file ``NSURL``."""
    return NSURL.fileURLWithPath_(p)

def cli_gen_audio(speech_cmd,rate,voice,out_path):
    """Run the ``say`` command to render ``speech_cmd`` into ``out_path``.

    Args:
        speech_cmd: Text handed to ``say`` as its final argument.
        rate: Value for the ``-r`` option; stringified before use.
        voice: Value for the ``-v`` option.
        out_path: Value for the ``-o`` option (output audio file).

    The exit status of ``say`` is not inspected.
    """
    say_argv = ['say']
    say_argv.extend(['-v', voice])
    say_argv.extend(['-r', str(rate)])
    say_argv.extend(['-o', out_path])
    say_argv.append(speech_cmd)
    subprocess.call(say_argv)

class SynthFile(object):
    """Metadata record describing one generated audio sample.

    Attributes are plain values copied from the constructor arguments:
    ``word``, ``phoneme``, ``filename``, ``voice``, ``rate`` and ``variant``
    (the latter is the constructor's ``operation`` argument).
    """
    def __init__(self,word,phon, filename,voice,rate,operation):
        super(SynthFile, self).__init__()
        self.word = word
        self.phoneme = phon
        self.filename = filename
        self.voice = voice
        self.rate = rate
        self.variant = operation

    def get_json(self):
        """Return a JSON-serialisable dict for this sample.

        The ``'operation'`` key carries the value stored in ``self.variant``.
        """
        return {'filename':self.filename,'voice':self.voice,
                'rate':self.rate,'operation':self.variant}

    def get_csv(self):
        """Return one newline-terminated CSV row for this sample.

        Column order: word, phoneme, voice, rate, variant, filename.
        Fields are joined verbatim; no quoting or escaping is applied.
        """
        return '{},{},{},{},{},{}\n'.format(self.word,self.phoneme,self.voice,self.rate,self.variant,self.filename)

class SynthVariant(object):
    """One installed voice at one speaking rate.

    Holds two ``NSSpeechSynthesizer`` instances for the voice: ``synth``
    (used to convert text to phonemes) and ``phone_synth`` (configured for
    phoneme input mode). Audio files themselves are produced through
    ``cli_gen_audio``.
    """
    def __init__(self,identifier,rate):
        """Set up synthesizers for voice ``identifier`` at speaking ``rate``.

        Args:
            identifier: Voice identifier string; the text after its last
                ``.`` becomes ``self.name``.
            rate: Speaking rate applied to both synthesizers and later
                passed to ``cli_gen_audio``.
        """
        super(SynthVariant, self).__init__()
        self.synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
        self.synth.setVolume_(100)
        self.synth.setRate_(rate)
        self.phone_synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
        self.phone_synth.setVolume_(100)
        self.phone_synth.setRate_(rate)
        # Switch the second synthesizer to phoneme input mode.
        self.phone_synth.setObject_forProperty_error_(NSSpeechModePhoneme,NSSpeechInputModeProperty,None)
        self.identifier = identifier
        self.rate = rate
        self.name = identifier.split('.')[-1]

    def __repr__(self):
        return 'Synthesizer[{} - {}]({})'.format(self.name,self.rate,self.identifier)

    def generate_audio(self,word,variant):
        """Render ``word`` to an audio file and return its ``SynthFile`` record.

        Args:
            word: Text to speak.
            variant: ``'low'`` or ``'high'`` speak ``word`` as plain text and
                record the synthesizer's phoneme string for it. ``'medium'``
                removes all digits from that phoneme string (presumably
                stress markers; confirm against the phoneme reference) and
                speaks the result through the ``[[inpt PHON]]`` embedded
                command. Any other value speaks ``word`` as plain text and
                records an empty phoneme string.

        Returns:
            A ``SynthFile`` describing the generated file.
        """
        orig_phon = self.synth.phonemesFromText_(word)
        phoneme,phon_cmd = '',word
        if variant in ('low','high'):
            phoneme = orig_phon
        elif variant == 'medium':
            phoneme = re.sub(r'[0-9]','',orig_phon)
            phon_cmd = '[[inpt PHON]] '+phoneme
        fname = dest_filename(word,phoneme,self.name,self.rate)
        # The rate is an int; the output path is built by string
        # concatenation, so hand it over as text.
        d_path = dest_path(self.name,str(self.rate),fname)
        cli_gen_audio(phon_cmd,self.rate,self.name,d_path)
        return SynthFile(word,phoneme,fname,self.name,self.rate,variant)


def synth_generator():
    """Prepare synthesizers and output directories; return a batch generator.

    Selects every installed voice whose ``VoiceLanguage`` is ``'en-US'`` and
    whose short name (text after the last ``.`` of the identifier) starts
    with an upper-case letter, and pairs each with every rate in
    ``[150, 180, 210, 250]``. For each pair a ``SynthVariant`` is built and
    the directory ``<dest_dir><short name>/<rate>`` is created, matching the
    location ``SynthVariant.generate_audio`` writes to.

    Returns:
        A function ``synth_for_words(words)`` that generates the ``'low'``,
        ``'medium'`` and ``'high'`` variants of every word with every
        prepared synthesizer and returns the resulting list of ``SynthFile``
        records.
    """
    voices_installed = NSSpeechSynthesizer.availableVoices()
    voice_attrs = [NSSpeechSynthesizer.attributesForVoice_(v) for v in  voices_installed]
    us_voices_ids = [v['VoiceIdentifier'] for v in voice_attrs if v['VoiceLanguage'] == 'en-US' and v['VoiceIdentifier'].split('.')[-1][0].isupper()]
    voice_rates = [150,180,210,250]
    voice_synths = []
    # Create each directory level in turn so this works even when
    # create_dir only makes a single level at a time.
    create_dir(os.path.dirname(dest_dir.rstrip('/')))
    create_dir(dest_dir)
    for v in us_voices_ids:
        # Audio is written under the short voice name, not the full identifier.
        voice_dir = dest_dir+v.split('.')[-1]
        create_dir(voice_dir)
        for r in voice_rates:
            create_dir(voice_dir+'/'+str(r))
            voice_synths.append(SynthVariant(v,r))
    def synth_for_words(words):
        return [s.generate_audio(w,variant)
                for w in words
                for s in voice_synths
                for variant in ['low','medium','high']]
    return synth_for_words

def write_synths(synth_list,fname,csv=False):
    """Write the records in ``synth_list`` to the file ``fname``.

    Args:
        synth_list: Iterable of objects providing ``get_csv()`` and
            ``get_json()``.
        fname: Path of the file to (over)write.
        csv: When true, write each record's ``get_csv()`` row in order;
            otherwise dump a JSON list of each record's ``get_json()`` dict.
    """
    # The context manager closes the file even if serialisation raises.
    with open(fname,'w') as f:
        if csv:
            f.writelines(s.get_csv() for s in synth_list)
        else:
            json.dump([s.get_json() for s in synth_list],f)

def generate_audio_for_stories(stories_path='./inputs/all_stories_hs.json'):
    """Generate audio for every word listed in a stories JSON file.

    The file is expected to hold a JSON object; each of its values is
    iterated and the first element of every item is taken as a word.

    Args:
        stories_path: Path of the JSON file to read. Defaults to the
            project's ``./inputs/all_stories_hs.json``.

    Returns:
        The list produced by the function returned from ``synth_generator``
        when called with the collected words.
    """
    # Read inside a context manager so the handle is closed promptly.
    with open(stories_path) as stories_file:
        stories_data = json.load(stories_file)
    word_list = [t[0] for i in stories_data.values() for t in i]
    words_audio_synth = synth_generator()
    return words_audio_synth(word_list)

# words_audio_synth = synth_generator()
# synth = NSSpeechSynthesizer.alloc().init()
# voices_installed = NSSpeechSynthesizer.availableVoices()
# voice_attrs = [NSSpeechSynthesizer.attributesForVoice_(v) for v in  voices_installed]
# us_voices_ids = [v['VoiceIdentifier'] for v in voice_attrs if v['VoiceLanguage'] == 'en-US' and v['VoiceIdentifier'].split('.')[-1][0].isupper()]
# synth.setVoice_(us_voices_ids[2])
# synth.startSpeakingString_('your')
# fname = dest_filename(word,self.name,self.rate,self.operation)
# d_path = dest_path(fname)
# d_url = dest_url(d_path)

# Script entry: synthesize every variant of the single word OUTPUT_NAME and
# record the generated files in the CSV manifest.
_synthesize = synth_generator()
synths = _synthesize([OUTPUT_NAME])
# synths = generate_audio_for_stories()
write_synths(synths, dest_file, csv=True)
# write_synths(synths,'./outputs/synths.json')