"""Generate synthesized speech samples with the macOS speech synthesizer.

For every selected voice and speaking rate, each input text is rendered in
three variants ('low', 'medium', 'high') via the ``say`` command line tool,
and one record per generated audio file is written to a CSV/JSON log.
"""
import csv
import json
import os
import random
import re
import subprocess
import sys
import time
import traceback

import objc
from AppKit import NSSpeechSynthesizer, NSSpeechInputModeProperty
from AppKit import NSSpeechModePhoneme
from Foundation import NSURL
from tqdm import tqdm

from generate_similar import similar_phoneme_phrase, similar_phrase
from speech_tools import hms_string, create_dir, format_filename, reservoir_sample

OUTPUT_NAME = 'test_5_words'
# Absolute directory that receives the audio files.
dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
# Log file (one record per generated audio file).
dest_file = './outputs/' + OUTPUT_NAME + '.csv'


def dest_filename(w, v, r, t):
    """Build a sanitized ``.aiff`` file name for one generated sample.

    Args:
        w: the text being synthesized.
        v: voice name.
        r: speaking rate.
        t: variant label.

    Returns:
        ``'<w>-<v>-<r>-<t>-<random int in 0..10000>.aiff'`` passed through
        ``format_filename``.
    """
    rand_no = str(random.randint(0, 10000))
    fname = '{}-{}-{}-{}-{}.aiff'.format(w, v, r, t, rand_no)
    return format_filename(fname)


def dest_path(v, r, n):
    """Return ``(absolute_path, relative_path)`` for file ``n``.

    The relative path is ``'<v>/<r>/<n>'``; the absolute one is that path
    appended to ``dest_dir``.
    """
    rel = v + '/' + str(r) + '/' + n
    return (dest_dir + rel), rel


def cli_gen_audio(speech_cmd, rate, voice, out_path):
    """Render ``speech_cmd`` to ``out_path`` using the ``say`` command.

    Args:
        speech_cmd: text (or embedded phoneme command) to speak.
        rate: speaking rate, passed to ``-r``.
        voice: voice name, passed to ``-v``.
        out_path: output audio file, passed to ``-o``.

    The exit status of ``say`` is not checked.
    """
    # NOTE(review): arguments are passed as a list (no shell), so the single
    # quotes below reach `say` as literal characters. Kept as-is to preserve
    # the generated audio; confirm whether they are intended.
    subprocess.call(
        ['say', '-v', voice, '-r', str(rate), '-o', out_path,
         "'" + speech_cmd + "'"])


class SynthFile(object):
    """Record describing one generated audio file.

    Attributes (all set from the constructor arguments):
        word, phoneme, filename, voice, voice_lang, rate, variant
    """

    def __init__(self, word, phon, filename, voice, voice_lang, rate, operation):
        super(SynthFile, self).__init__()
        self.word = word
        self.phoneme = phon
        self.filename = filename
        self.voice = voice
        self.voice_lang = voice_lang
        self.rate = rate
        # The `operation` argument is stored under the name `variant`.
        self.variant = operation

    def get_json(self):
        """Return a JSON-serializable dict describing this file."""
        return {
            'filename': self.filename,
            'voice': self.voice,
            'rate': self.rate,
            # Fixed: this used to read the non-existent `self.operation`
            # and raised AttributeError; the value lives in `self.variant`.
            'operation': self.variant
        }

    def get_csv(self):
        """Return the record as one comma-joined line ending in a newline.

        Values are joined verbatim (no quoting or escaping).
        """
        return ','.join(self.get_values()) + '\n'

    def get_values(self):
        """Return the record's columns as a list of strings.

        Column order: word, phoneme, voice, voice_lang, rate, variant,
        filename.
        """
        cols = [self.word, self.phoneme, self.voice, self.voice_lang,
                self.rate, self.variant, self.filename]
        return [str(c) for c in cols]


class SynthVariant(object):
    """One (voice, rate) combination able to generate audio variants."""

    def __init__(self, identifier, voice, lang, rate):
        """Create the synthesizers for a voice.

        Args:
            identifier: voice identifier given to ``initWithVoice_``.
            voice: voice name (used for ``say -v`` and output paths).
            lang: voice language, copied into generated records.
            rate: speaking rate.
        """
        super(SynthVariant, self).__init__()
        self.synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
        self.synth.setVolume_(100)
        self.synth.setRate_(rate)
        # Second synthesizer switched to phoneme input mode. It is not used
        # by generate_audio below, which goes through the `say` CLI.
        self.phone_synth = NSSpeechSynthesizer.alloc().initWithVoice_(
            identifier)
        self.phone_synth.setVolume_(100)
        self.phone_synth.setRate_(rate)
        self.phone_synth.setObject_forProperty_error_(
            NSSpeechModePhoneme, NSSpeechInputModeProperty, None)
        self.identifier = identifier
        self.rate = rate
        self.name = voice
        self.lang = lang
        self.phoneme_capable = self.is_phoneme_capable()

    def __repr__(self):
        return 'Synthesizer[{} - {}]'.format(self.name, self.rate)

    def is_phoneme_capable(self):
        """Return True if the voice yields a non-empty phoneme string
        for the probe word 'water'."""
        orig_phon = self.synth.phonemesFromText_('water')
        return orig_phon != ''

    def generate_audio(self, text, variant):
        """Synthesize ``text`` as the given variant and return its record.

        Variants:
            'low'    - speak ``text`` unchanged; phoneme is the original one.
            'medium' - speak ``similar_phoneme_phrase`` of the original
                       phonemes, via the ``[[inpt PHON]]`` embedded command.
            'high'   - speak ``similar_phrase(text)``.
        Any other value speaks ``text`` unchanged with an empty phoneme.

        Returns:
            SynthFile: record whose filename is relative to ``dest_dir``.
        """
        orig_phon = self.synth.phonemesFromText_(text)
        phoneme, phon_cmd = '', text
        if variant == 'low':
            # self.synth.startSpeakingString_toURL_(word,d_url)
            phoneme = orig_phon
        elif variant == 'medium':
            phoneme = similar_phoneme_phrase(orig_phon)
            phon_cmd = '[[inpt PHON]] ' + phoneme
        elif variant == 'high':
            phoneme = similar_phrase(text)
            phon_cmd = phoneme
        # elif variant == 'long':
        #     if phon != '':
        #         self.phone_synth.startSpeakingString_toURL_(phon,d_url)
        #     else:
        #         self.synth.startSpeakingString_toURL_(word,d_url)
        fname = dest_filename(text, self.name, self.rate, variant)
        d_path, r_path = dest_path(self.name, self.rate, fname)
        # d_url = NSURL.fileURLWithPath_(d_path)
        cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
        return SynthFile(text, phoneme, r_path, self.name, self.lang,
                         self.rate, variant)

    def create_synth_dirs(self):
        """Create ``dest_dir/<voice>/<rate>`` if the voice is phoneme capable."""
        if self.phoneme_capable:
            create_dir(dest_dir + self.name + '/' + str(self.rate))

    @staticmethod
    def voices_for_lang(lang):
        """Return ``(identifier, name, language)`` tuples for installed
        voices whose language equals ``lang`` and whose gender is not
        'VoiceGenderNeuter'."""
        voices_installed = NSSpeechSynthesizer.availableVoices()
        voice_attrs = [
            NSSpeechSynthesizer.attributesForVoice_(v)
            for v in voices_installed
        ]
        # sk = [k for k in voice_attrs[0].keys() if k not in [
        #     'VoiceIndividuallySpokenCharacters', 'VoiceSupportedCharacters']]
        # s_attrs = [[v[i] for i in sk] for v in voice_attrs if 'VoiceShowInFullListOnly' in v
        #            and 'VoiceRelativeDesirability' in v]
        return [
            (v['VoiceIdentifier'], v['VoiceName'], v['VoiceLanguage'])
            for v in voice_attrs
            if v['VoiceLanguage'] == lang
            and v['VoiceGender'] != 'VoiceGenderNeuter'
        ]

    @classmethod
    def synth_with(cls, voice_params, rate=180):
        """Alternate constructor from an ``(identifier, name, lang)`` tuple."""
        identifier, voice, lang = voice_params
        return cls(identifier, voice, lang, rate)


def synth_generator():
    """Build the synthesizers and return a ``synth_for_words`` function.

    One SynthVariant is created per en-US voice and per rate in
    (150, 180, 210); those that are not phoneme capable are discarded.
    ``dest_dir`` is created as a side effect.

    Returns:
        A function ``synth_for_words(words, writer)`` that generates every
        variant of every word with every kept synthesizer and passes each
        resulting SynthFile to ``writer``.
    """
    us_voices_ids = SynthVariant.voices_for_lang('en-US')
    voice_rates = [150, 180, 210]  # , 250]
    voice_synths = []
    create_dir(dest_dir)
    for vp in us_voices_ids:
        for r in voice_rates:
            s = SynthVariant.synth_with(vp, r)
            if s.phoneme_capable:
                print('Adding ', s)
                voice_synths.append(s)
            else:
                print('Discarding phoneme incapable ', s)

    def synth_for_words(words, writer):
        start_time = time.time()
        for s in voice_synths:
            s.create_synth_dirs()
            for v in ['low', 'medium', 'high']:
                prog = tqdm(words)
                prog.set_postfix(variant=v, voice=s.name, rate=s.rate)
                # Fixed: iterate the configured bar itself. The original
                # looped over a second `tqdm(words)`, so `prog` never
                # advanced and an extra, unlabeled bar was displayed.
                for w in prog:
                    prog.set_description('Synthesizing text:"{}"'.format(w))
                    synthed = s.generate_audio(w, v)
                    writer(synthed)
                prog.close()
        end_time = time.time()
        time_str = hms_string(end_time - start_time)
        print("It took {} to synthsize all variants.".format(time_str))

    return synth_for_words


def synth_logger(fname, csv_mode=False):
    """Open ``fname`` for writing and return ``(writer, close_file)``.

    Args:
        fname: path of the log file (truncated on open).
        csv_mode: when True, ``writer(s)`` immediately writes
            ``s.get_values()`` as a CSV row. When False, ``writer(s)`` only
            collects ``s`` and the whole list is dumped as JSON by
            ``close_file``.

    Returns:
        tuple: ``(writer, close_file)``; ``close_file()`` must be called to
        flush and close the file.
    """
    f = open(fname, 'w')
    s_csv_w = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
    synth_list = []

    def csv_writer(s):
        s_csv_w.writerow(s.get_values())

    def json_writer(s):
        synth_list.append(s)

    def close_file():
        # Close the handle even when serialization fails.
        try:
            if not csv_mode:
                json.dump([s.get_json() for s in synth_list], f)
        finally:
            f.close()

    if csv_mode:
        return csv_writer, close_file
    return json_writer, close_file


def generate_audio_for_text_list(text_list):
    """Generate all audio variants for ``text_list`` and log them as CSV
    to ``dest_file``.

    Errors raised while synthesizing are printed to stdout and do not
    propagate (best effort); the log file is always closed.
    """
    writer, closer = synth_logger(dest_file, csv_mode=True)
    try:
        synth_for_texts = synth_generator()
        try:
            synth_for_texts(text_list, writer)
        except Exception:
            # Best effort: report and keep what was written so far.
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            traceback.print_exc(file=sys.stdout)
    finally:
        closer()


def generate_audio_for_stories():
    '''
    Generates the audio sample variants for the list of words in the stories
    '''
    # story_file = './inputs/all_stories_hs.json'
    story_file = './inputs/all_stories.json'
    with open(story_file) as story_fp:
        stories_data = json.load(story_fp)
    # text_list_dup = [t[0] for i in stories_data.values() for t in i]
    text_list_dup = [t for i in stories_data.values() for t in i]
    # De-duplicate and sort.
    text_list = sorted(set(text_list_dup))
    generate_audio_for_text_list(text_list)


def generate_test_audio_for_stories(sample_count=0):
    '''
    Picks a list of words from the wordlist that are not in story words
    and generates the variants

    sample_count: when > 0, that many words are drawn with
    reservoir_sample; otherwise every eligible word is used.
    '''
    story_file = './inputs/all_stories_hs.json'
    # story_file = './inputs/all_stories.json'
    with open(story_file) as story_fp:
        stories_data = json.load(story_fp)
    # Each story entry is indexed with [0] to obtain its text.
    text_set = {t[0] for i in stories_data.values() for t in i}
    # text_list = [i.replace('-','') for g in stories_data.values() for i in g]
    with open('./inputs/wordlist.txt', 'r') as words_fp:
        word_list = [line.strip('\n_') for line in words_fp]
    # Keep only words longer than 4 characters that no story uses.
    new_word_list = [i for i in word_list if i not in text_set and len(i) > 4]
    # test_words = new_word_list[:int(len(text_list)/5+1)]
    if sample_count > 0:
        test_words = reservoir_sample(new_word_list, sample_count)
    else:
        test_words = new_word_list
    generate_audio_for_text_list(test_words)


if __name__ == '__main__':
    generate_test_audio_for_stories(5)
    # generate_audio_for_text_list(['I want to go home','education'])
    # generate_audio_for_stories()