formatted

master
Malar Kannan 2017-10-25 13:36:41 +05:30
parent e6f0c8b21b
commit 82d0398d2c
8 changed files with 275 additions and 252 deletions

View File

@@ -1,55 +0,0 @@
#!/usr/bin/env python3
"""
Convert ARPABET <http://www.speech.cs.cmu.edu/cgi-bin/cmudict>
to Apple's codes <https://developer.apple.com/library/content/documentation/UserExperience/Conceptual/SpeechSynthesisProgrammingGuide/Phonemes/Phonemes.html>
"""
import sys
mapping = {s.split()[0]: s.split()[1] for s in """
AA AA
AE AE
AH UX
AO AO
AW AW
AY AY
B b
CH C
D d
DH D
EH EH
ER UXr
EY EY
F f
G g
HH h
IH IH
IY IY
JH J
K k
L l
M m
N n
NG N
OW OW
OY OY
P p
R r
S s
SH S
T t
TH T
UH UH
UW UW
V v
W w
Y y
Z z
ZH Z
""".strip().split('\n')}
arpabet_phonemes = sys.stdin.read().split()
apple_phonemes = [mapping[p.upper()] for p in arpabet_phonemes]
print('[[inpt PHON]] ' + ''.join(apple_phonemes))
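
A quick check of the table above, using the same mapping dict in a REPL (the ARPABET string is the CMU pronunciation of "sunflowers" with stress digits removed, since the table has no stressed variants):

>>> ''.join(mapping[p] for p in 'S AH N F L AW ER Z'.split())
'sUXnflAWUXrz'

so for that input the script prints: [[inpt PHON]] sUXnflAWUXrz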

View File

@@ -1,10 +0,0 @@
import pandas as pd
audio_file = pd.read_csv('./outputs/audio.csv',names=['word','voice','rate','type','filename'])
word_groups = audio_file.groupby('word')
# audio
lst = [1, 2, 3, 1, 2, 3]
s = pd.Series([1, 2, 3, 10, 20, 30], lst)
df3 = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})
s.groupby(level=0).sum()
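
For reference, the toy series above has index [1, 2, 3, 1, 2, 3], so grouping by index level 0 sums pairwise:

>>> s.groupby(level=0).sum()
1    11
2    22
3    33
dtype: int64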

View File

@@ -2,14 +2,24 @@ import pyaudio
 import numpy as np
 from matplotlib import pyplot as plt
-CHUNKSIZE = 1024  # fixed chunk size
+CHUNKSIZE = 44100 * 10  # fixed chunk size
 # initialize portaudio
-p = pyaudio.PyAudio()
-stream = p.open(format=pyaudio.paInt16, channels=1, rate=44100, input=True, frames_per_buffer=CHUNKSIZE)
+p_inp = pyaudio.PyAudio()
+# dev_n = p.get_device_count()
+# dev_infos = [p.get_device_info_by_index(index) for index in range(dev_n)]
+# [i for i in dev_infos]  # if i['name'] == 'record']
+stream = p_inp.open(
+    format=pyaudio.paInt24,
+    channels=2,
+    rate=44100,
+    input=True,
+    frames_per_buffer=CHUNKSIZE)
 # do this as long as you want fresh samples
 data = stream.read(CHUNKSIZE)
+len(data)
+CHUNKSIZE*10
 numpydata = np.fromstring(data, dtype=np.int16)
 # plot data
@@ -19,4 +29,27 @@ plt.show()
 # close stream
 stream.stop_stream()
 stream.close()
-p.terminate()
+p_inp.terminate()
+# open the file for reading.
+# wf = wave.open(sys.argv[1], 'rb')
+# create an audio object
+# p = pyaudio.PyAudio()
+# open stream based on the wave object which has been input.
+p_oup = pyaudio.PyAudio()
+stream = p_oup.open(
+    format=pyaudio.paInt24, channels=2, rate=44100, output=True)
+# read data (based on the chunk size)
+# data = wf.readframes(CHUNKSIZE)
+# play stream (looping from beginning of file to the end)
+# while data != '':
+# writing to the stream is what *actually* plays the sound.
+stream.write(data)
+# data = wf.readframes(chunk)
+# cleanup stuff.
+stream.close()
+p_oup.terminate()
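
A size sanity check on the capture above (a sketch, not part of the commit): paInt24 packs 3 bytes per sample, so one read should return CHUNKSIZE frames * 2 channels * 3 bytes.

# expected length of one stream.read(CHUNKSIZE) at paInt24, stereo
bytes_expected = CHUNKSIZE * 2 * 3  # 44100 * 10 * 6 = 2,646,000 bytes
# caveat: np.fromstring(data, dtype=np.int16) assumes 2-byte samples and
# will not decode 24-bit frames correctly; paInt16 capture would match it.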

View File

@@ -1,12 +0,0 @@
# import scipy.signal as sg
# import pysndfile.sndio as snd
#
# snd_data,samples,_ = snd.read('./outputs/sunflowers-Alex-150-normal-589.aiff')
# samples_per_seg = 3*int(samples*150/(3*1000))
# # samples/(len(snd_data)*1000.0)
# len(snd_data)
# samples_per_seg/2
#
# len(sg.spectrogram(snd_data,nperseg=samples_per_seg,noverlap=samples_per_seg/3)[2])
#
# from spectro_gen import generate_aiff_spectrogram

View File

@@ -1,6 +1,7 @@
 #!/usr/bin/env python
+#coding: utf-8
-""" This work is licensed under a Creative Commons Attribution 3.0 Unported License.
+""" This work is licensed under a Creative Commons Attribution 3.0 Unported
+License.
 Frank Zalkow, 2012-2013
 http://www.frank-zalkow.de/en/code-snippets/create-audio-spectrograms-with-python.html?i=1
 """
@@ -9,8 +10,9 @@ import numpy as np
 from matplotlib import pyplot as plt
 from pysndfile import sndio as snd
 from numpy.lib import stride_tricks
 """ short time fourier transform of audio signal """
+
 def stft(sig, frameSize, overlapFac=0.5, window=np.hanning):
     win = window(frameSize)
     hopSize = int(frameSize - np.floor(overlapFac * frameSize))
@@ -26,12 +28,18 @@ def stft(sig, frameSize, overlapFac=0.5, window=np.hanning):
     # zeros at end (thus samples can be fully covered by frames)
     samples = np.append(samples, np.zeros(frameSize))
-    frames = stride_tricks.as_strided(samples, shape=(cols, frameSize), strides=(samples.strides[0]*hopSize, samples.strides[0])).copy()
+    frames = stride_tricks.as_strided(
+        samples,
+        shape=(cols, frameSize),
+        strides=(samples.strides[0] * hopSize, samples.strides[0])).copy()
     frames *= win
     return np.fft.rfft(frames)
+
+
 """ scale frequency axis logarithmically """
+
 def logscale_spec(spec, sr=44100, factor=20.):
     timebins, freqbins = np.shape(spec)
@@ -58,7 +66,10 @@ def logscale_spec(spec, sr=44100, factor=20.):
     return newspec, freqs
+
+
 """ generate spectrogram for aiff audio with 150ms windows and 50ms overlap"""
+
 def generate_aiff_spectrogram(audiopath):
     samples, samplerate, _ = snd.read(audiopath)
     # samplerate, samples = wav.read(audiopath)
@@ -69,7 +80,10 @@ def generate_aiff_spectrogram(audiopath):
     ims = 20. * np.log10(np.abs(sshow) / 10e-6)
     return ims
+
+
 """ plot spectrogram"""
+
 def plotstft(audiopath, binsize=2**10, plotpath=None, colormap="jet"):
     samples, samplerate, _ = snd.read(audiopath)
     # samplerate, samples = wav.read(audiopath)
@@ -83,7 +97,12 @@ def plotstft(audiopath, binsize=2**10, plotpath=None, colormap="jet"):
     timebins, freqbins = np.shape(ims)
     # import pdb;pdb.set_trace()
     plt.figure(figsize=(15, 7.5))
-    plt.imshow(np.transpose(ims), origin="lower", aspect="auto", cmap=colormap, interpolation="none")
+    plt.imshow(
+        np.transpose(ims),
+        origin="lower",
+        aspect="auto",
+        cmap=colormap,
+        interpolation="none")
     plt.colorbar()
     plt.xlabel("time (s)")
@@ -92,7 +111,11 @@ def plotstft(audiopath, binsize=2**10, plotpath=None, colormap="jet"):
     plt.ylim([0, freqbins])
     xlocs = np.float32(np.linspace(0, timebins - 1, 5))
-    plt.xticks(xlocs, ["%.02f" % l for l in ((xlocs*len(samples)/timebins)+(0.5*binsize))/samplerate])
+    plt.xticks(xlocs, [
+        "%.02f" % l
+        for l in (
+            (xlocs * len(samples) / timebins) + (0.5 * binsize)) / samplerate
+    ])
     ylocs = np.int16(np.round(np.linspace(0, freqbins - 1, 10)))
     plt.yticks(ylocs, ["%.02f" % freq[i] for i in ylocs])
@@ -103,6 +126,13 @@ def plotstft(audiopath, binsize=2**10, plotpath=None, colormap="jet"):
     plt.clf()
+
+snd.get_info('./outputs/sunflowers-Alex-150-normal-589.aiff')
+snd_data_arr = snd.read('./outputs/sunflowers-Alex-150-normal-589.aiff')[0]
+snd_data = snd_data_arr.tobytes()
+snd_data_arr.dtype
+len(snd_data)
+
 if __name__ == '__main__':
     plotstft('./outputs/sunflowers-Alex-150-normal-589.aiff')
     plotstft('./outputs/sunflowers-Alex-180-normal-4763.aiff')
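
Taken together, the file's pipeline is stft -> logscale_spec -> dB scaling. A minimal usage sketch (hypothetical audio path; assumes pysndfile can read it):

samples, samplerate, _ = snd.read('./outputs/example.aiff')  # hypothetical
spec = stft(samples, frameSize=2**10)             # complex STFT, 50% overlap
sshow, freq = logscale_spec(spec, sr=samplerate)  # log-spaced frequency bins
ims = 20. * np.log10(np.abs(sshow) / 10e-6)       # amplitude -> dB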

View File

@@ -3,37 +3,28 @@ import numpy as np
 from spectro_gen import generate_aiff_spectrogram
 from sklearn.model_selection import train_test_split
 import itertools
-import pickle,gc
+import gc
-def sunflower_data():
-    audio_samples = pd.read_csv('./outputs/audio.csv',names=['word','voice','rate','variant','file'])
-    sunflowers = audio_samples.loc[audio_samples['word'] == 'sunflowers'].reset_index(drop=True)
-    sunflowers.loc[:,'file'] = sunflowers.loc[:,'file'].apply(lambda x:'outputs/'+x).apply(generate_aiff_spectrogram)
-    y_data = sunflowers['variant'].apply(lambda x:x=='normal').values
-    max_samples = sunflowers['file'].apply(lambda x:x.shape[0]).max()
-    sample_size = sunflowers['file'][0].shape[1]
-    sample_count = sunflowers['file'].shape[0]
-    sunflowers['file'][0].shape[0]
-    def append_zeros(spgr):
-        orig = spgr.shape[0]
-        return np.lib.pad(spgr,[(0, max_samples-orig), (0,0)],'median')
-    pad_sun = sunflowers['file'].apply(append_zeros).values
-    x_data = np.vstack(pad_sun).reshape((sample_count,max_samples,sample_size,))
-    return train_test_split(x_data,y_data,test_size=0.33)
 def get_siamese_pairs(groupF1, groupF2):
     group1 = [r for (i, r) in groupF1.iterrows()]
     group2 = [r for (i, r) in groupF2.iterrows()]
     f = [(g1, g2) for g2 in group2 for g1 in group1]
-    t = [i for i in itertools.combinations(group1,2)]+[i for i in itertools.combinations(group2,2)]
+    t = [i for i in itertools.combinations(group1, 2)
+         ] + [i for i in itertools.combinations(group2, 2)]
     return (t, f)
+
 def sunflower_pairs_data():
-    audio_samples = pd.read_csv('./outputs/audio.csv',names=['word','voice','rate','variant','file'])
-    audio_samples = audio_samples.loc[audio_samples['word'] == 'sunflowers'].reset_index(drop=True)
-    audio_samples.loc[:,'spectrogram'] = audio_samples.loc[:,'file'].apply(lambda x:'outputs/audio/'+x).apply(generate_aiff_spectrogram)
-    max_samples = audio_samples['spectrogram'].apply(lambda x:x.shape[0]).max()
-    sample_size = audio_samples['spectrogram'][0].shape[1]
+    audio_samples = pd.read_csv(
+        './outputs/audio.csv',
+        names=['word', 'voice', 'rate', 'variant', 'file'])
+    audio_samples = audio_samples.loc[audio_samples['word'] ==
+                                      'sunflowers'].reset_index(drop=True)
+    audio_samples.loc[:, 'spectrogram'] = audio_samples.loc[:, 'file'].apply(
+        lambda x: 'outputs/audio/' + x).apply(generate_aiff_spectrogram)
+    max_samples = audio_samples['spectrogram'].apply(
+        lambda x: x.shape[0]).max()
     same_data, diff_data = [], []
     for (w, g) in audio_samples.groupby(audio_samples['word']):
         sample_norm = g.loc[audio_samples['variant'] == 'normal']
@@ -43,34 +34,49 @@ def sunflower_pairs_data():
         diff_data.extend(diff)
     Y = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))])
     X_sample_pairs = same_data + diff_data
+
     def append_zeros(spgr):
-        sample = np.lib.pad(spgr,[(0, max_samples-spgr.shape[0]), (0,0)],'median')
+        sample = np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
+                            'median')
        return np.expand_dims(sample, axis=0)
+
     def create_X(sp):
         # sample_count = sp[0]['file'].shape[0]
         l_sample = append_zeros(sp[0]['spectrogram'])
-        r_sample = append_zeros(sp[1]['spectrogram'])#.apply(append_zeros).values
-        # x_data = np.vstack(pad_sun).reshape((sample_count,max_samples,sample_size))
+        r_sample = append_zeros(
+            sp[1]['spectrogram'])
         return np.expand_dims(np.vstack([l_sample, r_sample]), axis=0)
+
     X_list = (create_X(sp) for sp in X_sample_pairs)
     X = np.vstack(X_list)
     tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1)
     return train_test_split(X, Y, test_size=0.1)
+
 def create_spectrogram_data(audio_group='audio'):
-    audio_samples = pd.read_csv('./outputs/'+audio_group+'.csv',names=['word','voice','rate','variant','file'])
-    # audio_samples = audio_samples.loc[audio_samples['word'] == 'sunflowers'].reset_index(drop=True)
-    audio_samples.loc[:,'spectrogram'] = audio_samples.loc[:,'file'].apply(lambda x:'outputs/'+audio_group+'/'+x).apply(generate_aiff_spectrogram)
+    audio_samples = pd.read_csv(
+        './outputs/' + audio_group + '.csv',
+        names=['word', 'voice', 'rate', 'variant', 'file'])
+    # audio_samples = audio_samples.loc[audio_samples['word'] ==
+    # 'sunflowers'].reset_index(drop=True)
+    audio_samples.loc[:, 'spectrogram'] = audio_samples.loc[:, 'file'].apply(
+        lambda x: 'outputs/' + audio_group + '/' + x).apply(
+            generate_aiff_spectrogram)
     audio_samples.to_pickle('outputs/spectrogram.pkl')
+
 def create_speech_pairs_data(audio_group='audio'):
     audio_samples = pd.read_pickle('outputs/spectrogram.pkl')
-    max_samples = audio_samples['spectrogram'].apply(lambda x:x.shape[0]).max()
-    sample_size = audio_samples['spectrogram'][0].shape[1]
+    max_samples = audio_samples['spectrogram'].apply(
+        lambda x: x.shape[0]).max()
+    # sample_size = audio_samples['spectrogram'][0].shape[1]
+
     def append_zeros(spgr):
-        sample = np.lib.pad(spgr,[(0, max_samples-spgr.shape[0]), (0,0)],'median')
+        sample = np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
+                            'median')
         return sample
+
     def create_X(sp):
         l_sample = append_zeros(sp[0]['spectrogram'])
         r_sample = append_zeros(sp[1]['spectrogram'])
@@ -79,8 +85,8 @@ def create_speech_pairs_data(audio_group='audio'):
     print('generating siamese speech pairs')
     same_data, diff_data = [], []
     for (w, g) in audio_samples.groupby(audio_samples['word']):
-        sample_norm = g.loc[audio_samples['variant'] == 'normal']#.reset_index(drop=True)
-        sample_phon = g.loc[audio_samples['variant'] == 'phoneme']#.reset_index(drop=True)
+        sample_norm = g.loc[audio_samples['variant'] == 'normal']
+        sample_phon = g.loc[audio_samples['variant'] == 'phoneme']
         same, diff = get_siamese_pairs(sample_norm, sample_phon)
         same_data.extend([create_X(s) for s in same[:10]])
         diff_data.extend([create_X(d) for d in diff[:10]])
@@ -91,7 +97,8 @@ def create_speech_pairs_data(audio_group='audio'):
     print('pickling X/Y')
     np.save('outputs/X.npy', X)
     np.save('outputs/Y.npy', Y)
-    del X
+    del same_data
+    del diff_data
     gc.collect()
     print('train/test splitting speech pairs')
     tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1)
@@ -101,25 +108,6 @@ def create_speech_pairs_data(audio_group='audio'):
     np.save('outputs/tr_y.npy', tr_y)
     np.save('outputs/te_y.npy', te_y)
-# def create_speech_model_data():
-#     (max_samples,sample_size) = pickle.load(open('./spectrogram_vars.pkl','rb'))
-#     x_data_pos = np.load('outputs/x_data_pos.npy')
-#     x_data_neg = np.load('outputs/x_data_neg.npy')
-#     x_pos_train, x_pos_test, x_neg_train, x_neg_test =train_test_split(x_data_pos,x_data_neg,test_size=0.1)
-#     del x_data_pos
-#     del x_data_neg
-#     gc.collect()
-#     print('split train and test')
-#     tr_y = np.array(x_pos_train.shape[0]*[1])
-#     te_y = np.array(x_pos_test.shape[0]*[[1,0]])
-#     tr_pairs = np.array([x_pos_train,x_neg_train]).reshape(x_pos_train.shape[0],2,max_samples,sample_size)
-#     te_pairs = np.array([x_pos_test,x_neg_test]).reshape(x_pos_test.shape[0],2,max_samples,sample_size)
-#     print('reshaped to input dim')
-#     np.save('outputs/tr_pairs.npy',tr_pairs)
-#     np.save('outputs/te_pairs.npy',te_pairs)
-#     np.save('outputs/tr_y.npy',tr_y)
-#     np.save('outputs/te_y.npy',te_y)
-#     print('pickled speech model data')
 def speech_model_data():
     tr_pairs = np.load('outputs/tr_pairs.npy') / 255.0
@@ -130,6 +118,7 @@ def speech_model_data():
     te_y = np.load('outputs/te_y.npy')
     return tr_pairs, te_pairs, tr_y, te_y
+
 if __name__ == '__main__':
     # sunflower_pairs_data()
     # create_spectrogram_data()
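
For intuition, get_siamese_pairs builds all within-group ("same") and cross-group ("different") row pairs. A toy sketch with hypothetical data (pandas is already imported as pd in this module):

g1 = pd.DataFrame({'file': ['a1', 'a2', 'a3']})  # e.g. 'normal' variants
g2 = pd.DataFrame({'file': ['b1', 'b2']})        # e.g. 'phoneme' variants
t, f = get_siamese_pairs(g1, g2)
len(t)  # 4 = C(3,2) + C(2,2) same-group pairs
len(f)  # 6 = 3 * 2 cross-group pairs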

View File

@@ -1,4 +1,3 @@
 from __future__ import absolute_import
 from __future__ import print_function
-
 import numpy as np
@@ -12,8 +11,8 @@ from keras import backend as K
 def euclidean_distance(vects):
     x, y = vects
-    return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True),
-                  K.epsilon()))
+    return K.sqrt(
+        K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))
 def eucl_dist_output_shape(shapes):
@@ -79,27 +78,40 @@ input_b = Input(shape=input_dim)
 processed_a = base_network(input_a)
 processed_b = base_network(input_b)
-distance = Lambda(euclidean_distance,
-                  output_shape=eucl_dist_output_shape)(
-    [processed_a, processed_b]
-)
+distance = Lambda(
+    euclidean_distance,
+    output_shape=eucl_dist_output_shape)([processed_a, processed_b])
 model = Model([input_a, input_b], distance)
-tb_cb = TensorBoard(log_dir='./logs/siamese_logs', histogram_freq=1,
-                    batch_size=32, write_graph=True, write_grads=True,
-                    write_images=True, embeddings_freq=0,
-                    embeddings_layer_names=None, embeddings_metadata=None)
+tb_cb = TensorBoard(
+    log_dir='./logs/siamese_logs',
+    histogram_freq=1,
+    batch_size=32,
+    write_graph=True,
+    write_grads=True,
+    write_images=True,
+    embeddings_freq=0,
+    embeddings_layer_names=None,
+    embeddings_metadata=None)
 cp_file_fmt = './models/siamese_speech_model-{epoch:02d}-epoch-{val_acc:0.2f}\
 -acc.h5'
-cp_cb = ModelCheckpoint(cp_file_fmt, monitor='val_acc', verbose=0,
-                        save_best_only=False, save_weights_only=False,
-                        mode='auto', period=1)
+
+cp_cb = ModelCheckpoint(
+    cp_file_fmt,
+    monitor='val_acc',
+    verbose=0,
+    save_best_only=False,
+    save_weights_only=False,
+    mode='auto',
+    period=1)
 # train
 rms = RMSprop(lr=0.001)
 sgd = SGD(lr=0.001)
 model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])
-model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,
-          batch_size=128,
-          epochs=50,
-          validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),
+model.fit(
+    [tr_pairs[:, 0], tr_pairs[:, 1]],
+    tr_y,
+    batch_size=128,
+    epochs=50,
+    validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),
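
The contrastive_loss compiled above is not shown in this hunk; for context, the standard definition from the Keras siamese example it follows would be (margin of 1 assumed):

def contrastive_loss(y_true, y_pred):
    # y_pred is the euclidean distance emitted by the Lambda layer above
    margin = 1
    return K.mean(y_true * K.square(y_pred) +
                  (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))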

View File

@@ -1,28 +1,41 @@
 import objc
-from AppKit import NSSpeechSynthesizer,NSSpeechInputModeProperty,NSSpeechModePhoneme
-from Foundation import NSURL,NSError,NSObject
+from AppKit import NSSpeechSynthesizer, NSSpeechInputModeProperty
+from AppKit import NSSpeechModePhoneme
+from Foundation import NSURL
 import json
 import random
 import os
 import re
 import subprocess
 OUTPUT_NAME = 'audio'
 dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
 dest_file = './outputs/' + OUTPUT_NAME + '.csv'
 def create_dir(direc):
     if not os.path.exists(direc):
         os.mkdir(direc)
-dest_filename = lambda n,v,r,t: '{}-{}-{}-{}-'.format(n,v,r,t)+str(random.randint(0,10000))+'.aiff'
-dest_path = lambda v,r,n: dest_dir+v+'/'+r+'/'+n
-dest_url = lambda p: NSURL.fileURLWithPath_(p)
+
+
+def dest_filename(n, v, r, t):
+    return '{}-{}-{}-{}-'.format(n, v, r,
+                                 t) + str(random.randint(0, 10000)) + '.aiff'
+
+
+def dest_path(v, r, n):
+    return dest_dir + v + '/' + r + '/' + n
+
+
 def cli_gen_audio(speech_cmd, rate, voice, out_path):
-    subprocess.call(['say','-v',voice,'-r',str(rate),'-o',out_path,speech_cmd])
+    subprocess.call(
+        ['say', '-v', voice, '-r',
+         str(rate), '-o', out_path, speech_cmd])
+
+
 class SynthFile(object):
     """docstring for SynthFile."""
     def __init__(self, word, phon, filename, voice, rate, operation):
         super(SynthFile, self).__init__()
         self.word = word
@@ -33,23 +46,33 @@ class SynthFile(object):
         self.variant = operation
     def get_json(self):
-        return {'filename':self.filename,'voice':self.voice,
-                'rate':self.rate,'operation':self.operation}
+        return {
+            'filename': self.filename,
+            'voice': self.voice,
+            'rate': self.rate,
+            'operation': self.operation
+        }
     def get_csv(self):
-        return '{},{},{},{},{}\n'.format(self.word,self.phoneme,self.voice,self.rate,self.variant,self.filename)
+        return '{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice,
+                                         self.rate, self.variant,
                                          self.filename)
+
+
 class SynthVariant(object):
     """docstring for SynthVariant."""
     def __init__(self, identifier, rate):
         super(SynthVariant, self).__init__()
         self.synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
         self.synth.setVolume_(100)
         self.synth.setRate_(rate)
-        self.phone_synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
+        self.phone_synth = NSSpeechSynthesizer.alloc().initWithVoice_(
+            identifier)
         self.phone_synth.setVolume_(100)
         self.phone_synth.setRate_(rate)
-        self.phone_synth.setObject_forProperty_error_(NSSpeechModePhoneme,NSSpeechInputModeProperty,None)
+        self.phone_synth.setObject_forProperty_error_(
+            NSSpeechModePhoneme, NSSpeechInputModeProperty, None)
         self.identifier = identifier
         self.rate = rate
         self.name = identifier.split('.')[-1]
@@ -58,7 +81,8 @@ class SynthVariant(object):
         return 'Synthesizer[{} - {}]({})'.format(self.name, self.rate)
     def generate_audio(self, word, variant):
-        orig_phon,phoneme,phon_cmd = self.synth.phonemesFromText_(word),'',word
+        orig_phon, phoneme, phon_cmd = self.synth.phonemesFromText_(
+            word), '', word
         if variant == 'low':
             # self.synth.startSpeakingString_toURL_(word,d_url)
             phoneme = orig_phon
@@ -75,16 +99,23 @@ class SynthVariant(object):
         # self.synth.startSpeakingString_toURL_(word,d_url)
         fname = dest_filename(word, phoneme, self.name, self.rate)
         d_path = dest_path(self.name, self.rate, fname)
-        d_url = dest_url(d_path)
+        # d_url = NSURL.fileURLWithPath_(d_path)
         cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
         return SynthFile(word, phoneme, fname, self.name, self.rate, variant)
+
+
 def synth_generator():
     voices_installed = NSSpeechSynthesizer.availableVoices()
-    voice_attrs = [NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed]
-    us_voices_ids = [v['VoiceIdentifier'] for v in voice_attrs if v['VoiceLanguage'] == 'en-US' and v['VoiceIdentifier'].split('.')[-1][0].isupper()]
-    # us_voices_ids = ['com.apple.speech.synthesis.voice.Fred','com.apple.speech.synthesis.voice.Alex',
+    voice_attrs = [
+        NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed
+    ]
+    us_voices_ids = [
+        v['VoiceIdentifier'] for v in voice_attrs
+        if v['VoiceLanguage'] == 'en-US'
+        and v['VoiceIdentifier'].split('.')[-1][0].isupper()
+    ]
+    # us_voices_ids = ['com.apple.speech.synthesis.voice.Fred',
+    # 'com.apple.speech.synthesis.voice.Alex',
     # 'com.apple.speech.synthesis.voice.Victoria']
     # voice_rates = list(range(150,221,(220-180)//4))
     voice_rates = [150, 180, 210, 250]
@@ -94,6 +125,7 @@ def synth_generator():
         for r in voice_rates:
             create_dir(dest_dir + v + '/' + r)
             voice_synths.append(SynthVariant(v, r))
+
     def synth_for_words(words):
         all_synths = []
         for w in words:
@@ -101,8 +133,10 @@ def synth_generator():
             for v in ['low', 'medium', 'high']:
                 all_synths.append(s.generate_audio(w, v))
         return all_synths
+
     return synth_for_words
+
+
 def write_synths(synth_list, fname, csv=False):
     f = open(fname, 'w')
     if csv:
@@ -112,12 +146,14 @@ def write_synths(synth_list,fname,csv=False):
         json.dump([s.get_json() for s in synth_list], f)
     f.close()
+
+
 def generate_audio_for_stories():
     stories_data = json.load(open('./inputs/all_stories_hs.json'))
     word_list = [t[0] for i in stories_data.values() for t in i]
     words_audio_synth = synth_generator()
     return words_audio_synth(word_list)
+
 # words_audio_synth = synth_generator()
 # synth = NSSpeechSynthesizer.alloc().init()
 # voices_installed = NSSpeechSynthesizer.availableVoices()
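
For context, cli_gen_audio above shells out to macOS's say utility; a hypothetical call for one phoneme variant (values invented for illustration) would be:

cli_gen_audio('[[inpt PHON]] sUXnflAWUXrz', 180, 'Alex',
              './outputs/audio/Alex/180/sunflowers-test.aiff')
# roughly: say -v Alex -r 180 -o <out_path> "[[inpt PHON]] sUXnflAWUXrz"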