Compare commits

No commits in common. "f1e82a2539f7b8b2d692208379533cb0a246a375" and "e6f0c8b21b276962677679c7848ada02ff317c04" have entirely different histories.

9 changed files with 323 additions and 354 deletions

arpabet-to-apple.py — new file (+55)

@@ -0,0 +1,55 @@
#!/usr/bin/env python3
"""
Convert ARPABET <http://www.speech.cs.cmu.edu/cgi-bin/cmudict>
to Apple's codes <https://developer.apple.com/library/content/documentation/UserExperience/Conceptual/SpeechSynthesisProgrammingGuide/Phonemes/Phonemes.html>
"""
import sys
mapping = {s.split()[0]: s.split()[1] for s in """
AA AA
AE AE
AH UX
AO AO
AW AW
AY AY
B b
CH C
D d
DH D
EH EH
ER UXr
EY EY
F f
G g
HH h
IH IH
IY IY
JH J
K k
L l
M m
N n
NG N
OW OW
OY OY
P p
R r
S s
SH S
T t
TH T
UH UH
UW UW
V v
W w
Y y
Z z
ZH Z
""".strip().split('\n')}
arpabet_phonemes = sys.stdin.read().split()
apple_phonemes = [mapping[p.upper()] for p in arpabet_phonemes]
print('[[inpt PHON]] ' + ''.join(apple_phonemes))
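One caveat when piping CMUdict output into this script: dictionary entries carry stress digits (AH0, AH1, ...) that the table above has no keys for, so they must be stripped first. A minimal sketch, using a hypothetical helper that is not part of the repo:

# Hypothetical pre-processing, not in the repo: drop CMUdict stress digits
# ("AH1" -> "AH") so every phoneme matches a key in the mapping above.
import re

def strip_stress(pronunciation):
    return [re.sub(r'\d', '', p) for p in pronunciation.split()]

print(' '.join(strip_stress('S AH1 N F L AW2 ER0 Z')))  # S AH N F L AW ER Z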

create_triplets.py — new file (+10)

@@ -0,0 +1,10 @@
import pandas as pd

audio_file = pd.read_csv('./outputs/audio.csv',names=['word','voice','rate','type','filename'])
word_groups = audio_file.groupby('word')
# audio: scratch groupby examples from the pandas docs
lst = [1, 2, 3, 1, 2, 3]
s = pd.Series([1, 2, 3, 10, 20, 30], lst)
df3 = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})
s.groupby(level=0).sum()
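The script stops before building any triplets. A sketch of one plausible continuation from the word_groups above, assuming (as audio.csv is used elsewhere in this diff) that the type column holds normal/phoneme:

# Sketch only: cross-pair each word's normal and phoneme renditions.
# The column semantics are an assumption carried over from speech_data.py.
pairs = []
for word, group in word_groups:
    normal = group.loc[group['type'] == 'normal', 'filename'].tolist()
    phoneme = group.loc[group['type'] == 'phoneme', 'filename'].tolist()
    pairs.extend((word, n, p) for n in normal for p in phoneme)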

record_mic_speech.py (name inferred from the import in the deleted test file below) — changed

@@ -1,42 +1,22 @@
 import pyaudio
 import numpy as np
-# from matplotlib import pyplot as plt
-from spectro_gen import plot_stft, generate_spectrogram
+from matplotlib import pyplot as plt

-def record_spectrogram(n_sec, plot=False, playback=False):
-    SAMPLE_RATE = 22050
-    N_CHANNELS = 2
-    N_SEC = n_sec
-    CHUNKSIZE = int(SAMPLE_RATE * N_SEC / N_CHANNELS)  # fixed chunk size
-    # show_record_prompt()
-    input('Press [Enter] to start recording sample... ')
-    p_inp = pyaudio.PyAudio()
-    stream = p_inp.open(
-        format=pyaudio.paFloat32,
-        channels=N_CHANNELS,
-        rate=SAMPLE_RATE,
-        input=True,
-        frames_per_buffer=CHUNKSIZE)
-    data = stream.read(CHUNKSIZE)
-    numpydata = np.frombuffer(data, dtype=np.float32)
-    multi_channel = np.abs(np.reshape(numpydata, (-1, 2))).mean(axis=1)
-    one_channel = np.asarray([multi_channel, -1 * multi_channel]).T.reshape(-1)
-    mean_channel_data = one_channel.tobytes()
-    stream.stop_stream()
-    stream.close()
-    p_inp.terminate()
-    if plot:
-        plot_stft(one_channel, SAMPLE_RATE)
-    if playback:
-        p_oup = pyaudio.PyAudio()
-        stream = p_oup.open(
-            format=pyaudio.paFloat32,
-            channels=2,
-            rate=SAMPLE_RATE,
-            output=True)
-        stream.write(mean_channel_data)
-        stream.close()
-        p_oup.terminate()
-    ims, _ = generate_spectrogram(one_channel, SAMPLE_RATE)
-    return ims
+CHUNKSIZE = 1024  # fixed chunk size
+
+# initialize portaudio
+p = pyaudio.PyAudio()
+stream = p.open(format=pyaudio.paInt16, channels=1, rate=44100, input=True, frames_per_buffer=CHUNKSIZE)
+
+# do this as long as you want fresh samples
+data = stream.read(CHUNKSIZE)
+numpydata = np.fromstring(data, dtype=np.int16)
+
+# plot data
+plt.plot(numpydata)
+plt.show()
+
+# close stream
+stream.stop_stream()
+stream.close()
+p.terminate()
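For orientation, the sample arithmetic in the removed record_spectrogram, with illustrative numbers:

# n_sec=1.2 with the constants above:
SAMPLE_RATE, N_CHANNELS, N_SEC = 22050, 2, 1.2
CHUNKSIZE = int(SAMPLE_RATE * N_SEC / N_CHANNELS)  # 13230 frames per read
# stream.read(CHUNKSIZE) returns CHUNKSIZE frames x 2 channels of float32;
# np.reshape(numpydata, (-1, 2)) folds them back into (frame, channel) pairs,
# which .mean(axis=1) then collapses to a single mono signal.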

snippets.py — new file (+12)

@@ -0,0 +1,12 @@
# import scipy.signal as sg
# import pysndfile.sndio as snd
#
# snd_data,samples,_ = snd.read('./outputs/sunflowers-Alex-150-normal-589.aiff')
# samples_per_seg = 3*int(samples*150/(3*1000))
# # samples/(len(snd_data)*1000.0)
# len(snd_data)
# samples_per_seg/2
#
# len(sg.spectrogram(snd_data,nperseg=samples_per_seg,noverlap=samples_per_seg/3)[2])
#
# from spectro_gen import generate_aiff_spectrogram

spectro_gen.py (name inferred from imports elsewhere in this diff) — changed

@@ -1,19 +1,16 @@
 #!/usr/bin/env python
-#coding: utf-8
-""" This work is licensed under a Creative Commons Attribution 3.0 Unported
-License.
+""" This work is licensed under a Creative Commons Attribution 3.0 Unported License.
 Frank Zalkow, 2012-2013
 http://www.frank-zalkow.de/en/code-snippets/create-audio-spectrograms-with-python.html?i=1
 """
 # %matplotlib inline
 import numpy as np
-import pyaudio
 from matplotlib import pyplot as plt
 from pysndfile import sndio as snd
 from numpy.lib import stride_tricks

 """ short time fourier transform of audio signal """
 def stft(sig, frameSize, overlapFac=0.5, window=np.hanning):
     win = window(frameSize)
     hopSize = int(frameSize - np.floor(overlapFac * frameSize))

@@ -29,18 +26,12 @@ def stft(sig, frameSize, overlapFac=0.5, window=np.hanning):
     # zeros at end (thus samples can be fully covered by frames)
     samples = np.append(samples, np.zeros(frameSize))

-    frames = stride_tricks.as_strided(
-        samples,
-        shape=(cols, frameSize),
-        strides=(samples.strides[0] * hopSize, samples.strides[0])).copy()
+    frames = stride_tricks.as_strided(samples, shape=(cols, frameSize), strides=(samples.strides[0]*hopSize, samples.strides[0])).copy()
     frames *= win

     return np.fft.rfft(frames)

 """ scale frequency axis logarithmically """
 def logscale_spec(spec, sr=44100, factor=20.):
     timebins, freqbins = np.shape(spec)

@@ -67,36 +58,32 @@ def logscale_spec(spec, sr=44100, factor=20.):
     return newspec, freqs

 """ generate spectrogram for aiff audio with 150ms windows and 50ms overlap"""
-def generate_spectrogram(samples, samplerate):
+def generate_aiff_spectrogram(audiopath):
+    samples,samplerate,_ = snd.read(audiopath)
     # samplerate, samples = wav.read(audiopath)
     # s = stft(samples, binsize)
     s = stft(samples, samplerate*150//1000,1.0/3)
     sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate)
     ims = 20.*np.log10(np.abs(sshow)/10e-6)
-    return ims, freq
-
-def generate_aiff_spectrogram(audiopath):
-    samples, samplerate, _ = snd.read(audiopath)
-    ims, _ = generate_spectrogram(samples, samplerate)
     return ims

 """ plot spectrogram"""
-def plot_stft(samples, samplerate, binsize=2**10, plotpath=None, colormap="jet"):
-    (ims, freq) = generate_spectrogram(samples, samplerate)
-    # import pdb;pdb.set_trace()
+def plotstft(audiopath, binsize=2**10, plotpath=None, colormap="jet"):
+    samples,samplerate,_ = snd.read(audiopath)
+    # samplerate, samples = wav.read(audiopath)
+    # s = stft(samples, binsize)
+    # print(samplerate*150//1000)
+    s = stft(samples, samplerate*150//1000,1.0/3)
+    sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate)
+    ims = 20.*np.log10(np.abs(sshow)/10e-6)  # amplitude to decibel
     timebins, freqbins = np.shape(ims)
     plt.figure(figsize=(15, 7.5))
-    plt.imshow(
-        np.transpose(ims),
-        origin="lower",
-        aspect="auto",
-        cmap=colormap,
-        interpolation="none")
+    plt.imshow(np.transpose(ims), origin="lower", aspect="auto", cmap=colormap, interpolation="none")
     plt.colorbar()
     plt.xlabel("time (s)")

@@ -105,44 +92,20 @@ def plot_stft(samples, samplerate, binsize=2**10, plotpath=None, colormap="jet")
     plt.ylim([0, freqbins])

     xlocs = np.float32(np.linspace(0, timebins-1, 5))
-    plt.xticks(xlocs, [
-        "%.02f" % l
-        for l in (
-            (xlocs * len(samples) / timebins) + (0.5 * binsize)) / samplerate
-    ])
+    plt.xticks(xlocs, ["%.02f" % l for l in ((xlocs*len(samples)/timebins)+(0.5*binsize))/samplerate])
     ylocs = np.int16(np.round(np.linspace(0, freqbins-1, 10)))
     plt.yticks(ylocs, ["%.02f" % freq[i] for i in ylocs])

     if plotpath:
         plt.savefig(plotpath, bbox_inches="tight")
     else:
         plt.show()

     plt.clf()

-def plot_aiff_stft(audiopath, binsize=2**10, plotpath=None, colormap="jet"):
-    samples, samplerate, _ = snd.read(audiopath)
-    plot_stft(samples, samplerate)
-
-def play_sunflower():
-    sample_r = snd.get_info('./outputs/sunflowers-Alex-150-normal-589.aiff')[0]
-    snd_data_f64 = snd.read('./outputs/sunflowers-Alex-150-normal-589.aiff')[0]
-    snd_data_f32 = snd_data_f64.astype(np.float32)
-    print(snd_data_f32.shape)
-    snd_data = snd_data_f32.tobytes()
-    p_oup = pyaudio.PyAudio()
-    stream = p_oup.open(
-        format=pyaudio.paFloat32, channels=1, rate=sample_r, output=True)
-    stream.write(snd_data)
-    stream.close()
-    p_oup.terminate()
-    plot_stft(snd_data_f32, sample_r)
-
 if __name__ == '__main__':
-    play_sunflower()
-    # plot_aiff_stft('./outputs/sunflowers-Alex-150-normal-589.aiff')
-    # plot_aiff_stft('./outputs/sunflowers-Alex-180-normal-4763.aiff')
-    # plot_aiff_stft('./outputs/sunflowers-Victoria-180-normal-870.aiff')
-    # plot_aiff_stft('./outputs/sunflowers-Fred-180-phoneme-9733.aiff')
-    # plot_aiff_stft('./outputs/sunflowers-Fred-180-normal-6515.aiff')
+    plotstft('./outputs/sunflowers-Alex-150-normal-589.aiff')
+    plotstft('./outputs/sunflowers-Alex-180-normal-4763.aiff')
+    plotstft('./outputs/sunflowers-Victoria-180-normal-870.aiff')
+    plotstft('./outputs/sunflowers-Fred-180-phoneme-9733.aiff')
+    plotstft('./outputs/sunflowers-Fred-180-normal-6515.aiff')
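Both versions pin the stft analysis window to 150 ms with a one-third overlap factor. The hop that falls out of stft's hopSize formula, assuming a 22050 Hz file for illustration:

import numpy as np

samplerate = 22050                    # assumed rate, for illustration only
frameSize = samplerate * 150 // 1000  # 3307 samples ~ 150 ms window
overlapFac = 1.0 / 3
hopSize = int(frameSize - np.floor(overlapFac * frameSize))  # 2205 samples
print(frameSize - hopSize)  # 1102 samples ~ 50 ms, matching the docstring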

speech_data.py (name inferred from the import in speech_siamese.py below) — changed

@@ -3,38 +3,37 @@ import numpy as np
 from spectro_gen import generate_aiff_spectrogram
 from sklearn.model_selection import train_test_split
 import itertools
-import gc
+import pickle,gc
+
+def sunflower_data():
+    audio_samples = pd.read_csv('./outputs/audio.csv',names=['word','voice','rate','variant','file'])
+    sunflowers = audio_samples.loc[audio_samples['word'] == 'sunflowers'].reset_index(drop=True)
+    sunflowers.loc[:,'file'] = sunflowers.loc[:,'file'].apply(lambda x:'outputs/'+x).apply(generate_aiff_spectrogram)
+    y_data = sunflowers['variant'].apply(lambda x:x=='normal').values
+    max_samples = sunflowers['file'].apply(lambda x:x.shape[0]).max()
+    sample_size = sunflowers['file'][0].shape[1]
+    sample_count = sunflowers['file'].shape[0]
+    sunflowers['file'][0].shape[0]
+    def append_zeros(spgr):
+        orig = spgr.shape[0]
+        return np.lib.pad(spgr,[(0, max_samples-orig), (0,0)],'median')
+    pad_sun = sunflowers['file'].apply(append_zeros).values
+    x_data = np.vstack(pad_sun).reshape((sample_count,max_samples,sample_size,))
+    return train_test_split(x_data,y_data,test_size=0.33)

 def get_siamese_pairs(groupF1,groupF2):
     group1 = [r for (i,r) in groupF1.iterrows()]
     group2 = [r for (i,r) in groupF2.iterrows()]
     f = [(g1,g2) for g2 in group2 for g1 in group1]
-    t = [i for i in itertools.combinations(group1, 2)
-    ] + [i for i in itertools.combinations(group2, 2)]
+    t = [i for i in itertools.combinations(group1,2)]+[i for i in itertools.combinations(group2,2)]
     return (t,f)

-def create_X(sp, max_samples):
-    def append_zeros(spgr):
-        return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
-                          'median')
-    l_sample = append_zeros(sp[0]['spectrogram'])
-    r_sample = append_zeros(sp[1]['spectrogram'])
-    return np.asarray([l_sample, r_sample])
-
 def sunflower_pairs_data():
-    audio_samples = pd.read_csv(
-        './outputs/audio.csv',
-        names=['word', 'voice', 'rate', 'variant', 'file'])
-    audio_samples = audio_samples.loc[audio_samples['word'] ==
-                                      'sunflowers'].reset_index(drop=True)
-    audio_samples.loc[:, 'spectrogram'] = audio_samples.loc[:, 'file'].apply(
-        lambda x: 'outputs/audio/' + x).apply(generate_aiff_spectrogram)
-    max_samples = audio_samples['spectrogram'].apply(
-        lambda x: x.shape[0]).max()
+    audio_samples = pd.read_csv('./outputs/audio.csv',names=['word','voice','rate','variant','file'])
+    audio_samples = audio_samples.loc[audio_samples['word'] == 'sunflowers'].reset_index(drop=True)
+    audio_samples.loc[:,'spectrogram'] = audio_samples.loc[:,'file'].apply(lambda x:'outputs/audio/'+x).apply(generate_aiff_spectrogram)
+    max_samples = audio_samples['spectrogram'].apply(lambda x:x.shape[0]).max()
+    sample_size = audio_samples['spectrogram'][0].shape[1]
     same_data,diff_data = [],[]
     for (w,g) in audio_samples.groupby(audio_samples['word']):
         sample_norm = g.loc[audio_samples['variant'] == 'normal']

@@ -44,39 +43,47 @@ def sunflower_pairs_data():
         diff_data.extend(diff)
     Y = np.hstack([np.ones(len(same_data)),np.zeros(len(diff_data))])
     X_sample_pairs = same_data+diff_data
-    X_list = (create_X(sp, max_samples) for sp in X_sample_pairs)
+    def append_zeros(spgr):
+        sample = np.lib.pad(spgr,[(0, max_samples-spgr.shape[0]), (0,0)],'median')
+        return np.expand_dims(sample,axis=0)
+    def create_X(sp):
+        # sample_count = sp[0]['file'].shape[0]
+        l_sample = append_zeros(sp[0]['spectrogram'])
+        r_sample = append_zeros(sp[1]['spectrogram'])#.apply(append_zeros).values
+        # x_data = np.vstack(pad_sun).reshape((sample_count,max_samples,sample_size))
+        return np.expand_dims(np.vstack([l_sample,r_sample]),axis=0)
+    X_list = (create_X(sp) for sp in X_sample_pairs)
     X = np.vstack(X_list)
     tr_pairs,te_pairs,tr_y,te_y = train_test_split(X,Y,test_size=0.1)
     return train_test_split(X,Y,test_size=0.1)

 def create_spectrogram_data(audio_group='audio'):
-    audio_samples = pd.read_csv(
-        './outputs/' + audio_group + '.csv',
-        names=['word', 'voice', 'rate', 'variant', 'file'])
-    # audio_samples = audio_samples.loc[audio_samples['word'] ==
-    # 'sunflowers'].reset_index(drop=True)
-    audio_samples.loc[:, 'spectrogram'] = audio_samples.loc[:, 'file'].apply(
-        lambda x: 'outputs/' + audio_group + '/' + x).apply(
-            generate_aiff_spectrogram)
+    audio_samples = pd.read_csv('./outputs/'+audio_group+'.csv',names=['word','voice','rate','variant','file'])
+    # audio_samples = audio_samples.loc[audio_samples['word'] == 'sunflowers'].reset_index(drop=True)
+    audio_samples.loc[:,'spectrogram'] = audio_samples.loc[:,'file'].apply(lambda x:'outputs/'+audio_group+'/'+x).apply(generate_aiff_spectrogram)
     audio_samples.to_pickle('outputs/spectrogram.pkl')

 def create_speech_pairs_data(audio_group='audio'):
     audio_samples = pd.read_pickle('outputs/spectrogram.pkl')
-    max_samples = audio_samples['spectrogram'].apply(
-        lambda x: x.shape[0]).max()
-    # sample_size = audio_samples['spectrogram'][0].shape[1]
+    max_samples = audio_samples['spectrogram'].apply(lambda x:x.shape[0]).max()
+    sample_size = audio_samples['spectrogram'][0].shape[1]
+    def append_zeros(spgr):
+        sample = np.lib.pad(spgr,[(0, max_samples-spgr.shape[0]), (0,0)],'median')
+        return sample
+    def create_X(sp):
+        l_sample = append_zeros(sp[0]['spectrogram'])
+        r_sample = append_zeros(sp[1]['spectrogram'])
+        return np.asarray([l_sample,r_sample])
     print('generating siamese speech pairs')
     same_data,diff_data = [],[]
     for (w,g) in audio_samples.groupby(audio_samples['word']):
-        sample_norm = g.loc[audio_samples['variant'] == 'normal']
-        sample_phon = g.loc[audio_samples['variant'] == 'phoneme']
+        sample_norm = g.loc[audio_samples['variant'] == 'normal']#.reset_index(drop=True)
+        sample_phon = g.loc[audio_samples['variant'] == 'phoneme']#.reset_index(drop=True)
         same , diff = get_siamese_pairs(sample_norm,sample_phon)
-        same_data.extend([create_X(s, max_samples) for s in same[:10]])
-        diff_data.extend([create_X(d, max_samples) for d in diff[:10]])
+        same_data.extend([create_X(s) for s in same[:10]])
+        diff_data.extend([create_X(d) for d in diff[:10]])
     print('creating all speech pairs')
     Y = np.hstack([np.ones(len(same_data)),np.zeros(len(diff_data))])
     print('casting as array speech pairs')

@@ -84,8 +91,7 @@ def create_speech_pairs_data(audio_group='audio'):
     print('pickling X/Y')
     np.save('outputs/X.npy',X)
     np.save('outputs/Y.npy',Y)
-    del same_data
-    del diff_data
+    del X
     gc.collect()
     print('train/test splitting speech pairs')
     tr_pairs,te_pairs,tr_y,te_y = train_test_split(X,Y,test_size=0.1)

@@ -95,6 +101,25 @@ def create_speech_pairs_data(audio_group='audio'):
     np.save('outputs/tr_y.npy',tr_y)
     np.save('outputs/te_y.npy',te_y)

+# def create_speech_model_data():
+#     (max_samples,sample_size) = pickle.load(open('./spectrogram_vars.pkl','rb'))
+#     x_data_pos = np.load('outputs/x_data_pos.npy')
+#     x_data_neg = np.load('outputs/x_data_neg.npy')
+#     x_pos_train, x_pos_test, x_neg_train, x_neg_test =train_test_split(x_data_pos,x_data_neg,test_size=0.1)
+#     del x_data_pos
+#     del x_data_neg
+#     gc.collect()
+#     print('split train and test')
+#     tr_y = np.array(x_pos_train.shape[0]*[1])
+#     te_y = np.array(x_pos_test.shape[0]*[[1,0]])
+#     tr_pairs = np.array([x_pos_train,x_neg_train]).reshape(x_pos_train.shape[0],2,max_samples,sample_size)
+#     te_pairs = np.array([x_pos_test,x_neg_test]).reshape(x_pos_test.shape[0],2,max_samples,sample_size)
+#     print('reshaped to input dim')
+#     np.save('outputs/tr_pairs.npy',tr_pairs)
+#     np.save('outputs/te_pairs.npy',te_pairs)
+#     np.save('outputs/tr_y.npy',tr_y)
+#     np.save('outputs/te_y.npy',te_y)
+#     print('pickled speech model data')
+
 def speech_model_data():
     tr_pairs = np.load('outputs/tr_pairs.npy')/255.0

@@ -105,7 +130,6 @@ def speech_model_data():
     te_y = np.load('outputs/te_y.npy')
     return tr_pairs,te_pairs,tr_y,te_y

 if __name__ == '__main__':
     # sunflower_pairs_data()
     #create_spectrogram_data()
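Both versions normalize spectrogram lengths by padding each one out to the longest in the set with np.lib.pad in 'median' mode. In isolation, on a toy array rather than repo data:

import numpy as np

spgr = np.arange(12, dtype=float).reshape(4, 3)  # toy (timebins, freqbins)
max_samples = 6
padded = np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)], 'median')
print(padded.shape)  # (6, 3): two new rows, each filled with its column median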

speech_siamese.py (name inferred from the import in the deleted test file below) — changed

@@ -1,18 +1,19 @@
 from __future__ import absolute_import
 from __future__ import print_function
 import numpy as np
 from speech_data import speech_model_data
-from keras.models import Model,load_model
+from keras.models import Model
 from keras.layers import Input, Dense, Dropout, LSTM, Lambda
-from keras.optimizers import RMSprop
+from keras.optimizers import RMSprop, SGD
 from keras.callbacks import TensorBoard, ModelCheckpoint
 from keras import backend as K

 def euclidean_distance(vects):
     x, y = vects
-    return K.sqrt(
-        K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))
+    return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True),
+                  K.epsilon()))

 def eucl_dist_output_shape(shapes):

@@ -63,7 +64,6 @@ def accuracy(y_true, y_pred):
     return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype)))

-def train_siamese():
     # the data, shuffled and split between train and test sets
     tr_pairs, te_pairs, tr_y, te_y = speech_model_data()
     input_dim = (tr_pairs.shape[2], tr_pairs.shape[3])

@@ -79,39 +79,27 @@ def train_siamese():
     processed_a = base_network(input_a)
     processed_b = base_network(input_b)
-    distance = Lambda(
-        euclidean_distance,
-        output_shape=eucl_dist_output_shape)([processed_a, processed_b])
+    distance = Lambda(euclidean_distance,
+                      output_shape=eucl_dist_output_shape)(
+                          [processed_a, processed_b]
+                      )
     model = Model([input_a, input_b], distance)

-    tb_cb = TensorBoard(
-        log_dir='./logs/siamese_logs',
-        histogram_freq=1,
-        batch_size=32,
-        write_graph=True,
-        write_grads=True,
-        write_images=True,
-        embeddings_freq=0,
-        embeddings_layer_names=None,
-        embeddings_metadata=None)
+    tb_cb = TensorBoard(log_dir='./logs/siamese_logs', histogram_freq=1,
+                        batch_size=32, write_graph=True, write_grads=True,
+                        write_images=True, embeddings_freq=0,
+                        embeddings_layer_names=None, embeddings_metadata=None)
     cp_file_fmt = './models/siamese_speech_model-{epoch:02d}-epoch-{val_acc:0.2f}\
 -acc.h5'
-    cp_cb = ModelCheckpoint(
-        cp_file_fmt,
-        monitor='val_acc',
-        verbose=0,
-        save_best_only=False,
-        save_weights_only=False,
-        mode='auto',
-        period=1)
+    cp_cb = ModelCheckpoint(cp_file_fmt, monitor='val_acc', verbose=0,
+                            save_best_only=False, save_weights_only=False,
+                            mode='auto', period=1)

     # train
     rms = RMSprop(lr=0.001)
+    sgd = SGD(lr=0.001)
     model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])
-    model.fit(
-        [tr_pairs[:, 0], tr_pairs[:, 1]],
-        tr_y,
+    model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,
               batch_size=128,
               epochs=50,
               validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),

@@ -126,23 +114,3 @@ def train_siamese():
     print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
     print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
-
-def trained_siamese_model():
-    # input_dim = (15, 1654)
-    # base_network = create_base_rnn_network(input_dim)
-    # input_a = Input(shape=input_dim)
-    # input_b = Input(shape=input_dim)
-    # processed_a = base_network(input_a)
-    # processed_b = base_network(input_b)
-    # distance = Lambda(
-    #     euclidean_distance,
-    #     output_shape=eucl_dist_output_shape)([processed_a, processed_b])
-    #
-    # model = Model([input_a, input_b], distance)
-    model = load_model('./models/siamese_speech_model-final.h5')
-    return model
-
-if __name__ == '__main__':
-    train_siamese()
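contrastive_loss is compiled into the model above, but its definition falls outside the hunks shown. If the file follows the stock Keras siamese example it otherwise mirrors, the loss would be the Hadsell-style form below; this is a sketch, not the repo's confirmed code:

from keras import backend as K

def contrastive_loss(y_true, y_pred, margin=1.0):
    # y_true is 1 for same-word pairs, 0 for different pairs; y_pred is the
    # euclidean distance produced by the Lambda layer above.
    return K.mean(y_true * K.square(y_pred) +
                  (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))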

(filename not shown) — deleted file (-7)

@@ -1,7 +0,0 @@
-# from speech_siamese import trained_siamese_model
-from keras.models import load_model
-from record_mic_speech import record_spectrogram
-
-model = load_model('./models/siamese_speech_model-final.h5')
-spec1 = record_spectrogram(n_sec=1.2)
-spec2 = record_spectrogram(n_sec=1.2)

(filename not shown) — changed

@@ -1,41 +1,28 @@
 import objc
-from AppKit import NSSpeechSynthesizer, NSSpeechInputModeProperty
-from AppKit import NSSpeechModePhoneme
-from Foundation import NSURL
+from AppKit import NSSpeechSynthesizer,NSSpeechInputModeProperty,NSSpeechModePhoneme
+from Foundation import NSURL,NSError,NSObject
 import json
 import random
 import os
 import re
 import subprocess

 OUTPUT_NAME = 'audio'
 dest_dir = os.path.abspath('.')+'/outputs/'+OUTPUT_NAME+'/'
 dest_file = './outputs/'+OUTPUT_NAME+'.csv'

 def create_dir(direc):
     if not os.path.exists(direc):
         os.mkdir(direc)

-def dest_filename(n, v, r, t):
-    return '{}-{}-{}-{}-'.format(n, v, r,
-                                 t) + str(random.randint(0, 10000)) + '.aiff'
-
-def dest_path(v, r, n):
-    return dest_dir + v + '/' + r + '/' + n
+dest_filename = lambda n,v,r,t: '{}-{}-{}-{}-'.format(n,v,r,t)+str(random.randint(0,10000))+'.aiff'
+dest_path = lambda v,r,n: dest_dir+v+'/'+r+'/'+n
+dest_url = lambda p: NSURL.fileURLWithPath_(p)

 def cli_gen_audio(speech_cmd,rate,voice,out_path):
-    subprocess.call(
-        ['say', '-v', voice, '-r',
-         str(rate), '-o', out_path, speech_cmd])
+    subprocess.call(['say','-v',voice,'-r',str(rate),'-o',out_path,speech_cmd])

 class SynthFile(object):
     """docstring for SynthFile."""
     def __init__(self,word,phon, filename,voice,rate,operation):
         super(SynthFile, self).__init__()
         self.word = word

@@ -46,33 +33,23 @@ class SynthFile(object):
         self.variant = operation

     def get_json(self):
-        return {
-            'filename': self.filename,
-            'voice': self.voice,
-            'rate': self.rate,
-            'operation': self.operation
-        }
+        return {'filename':self.filename,'voice':self.voice,
+                'rate':self.rate,'operation':self.operation}

     def get_csv(self):
-        return '{},{},{},{},{}\n'.format(self.word, self.phoneme, self.voice,
-                                         self.rate, self.variant,
-                                         self.filename)
+        return '{},{},{},{},{}\n'.format(self.word,self.phoneme,self.voice,self.rate,self.variant,self.filename)

 class SynthVariant(object):
     """docstring for SynthVariant."""
     def __init__(self,identifier,rate):
         super(SynthVariant, self).__init__()
         self.synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
         self.synth.setVolume_(100)
         self.synth.setRate_(rate)
-        self.phone_synth = NSSpeechSynthesizer.alloc().initWithVoice_(
-            identifier)
+        self.phone_synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
         self.phone_synth.setVolume_(100)
         self.phone_synth.setRate_(rate)
-        self.phone_synth.setObject_forProperty_error_(
-            NSSpeechModePhoneme, NSSpeechInputModeProperty, None)
+        self.phone_synth.setObject_forProperty_error_(NSSpeechModePhoneme,NSSpeechInputModeProperty,None)
         self.identifier = identifier
         self.rate = rate
         self.name = identifier.split('.')[-1]

@@ -81,8 +58,7 @@ class SynthVariant(object):
         return 'Synthesizer[{} - {}]({})'.format(self.name,self.rate)

     def generate_audio(self,word,variant):
-        orig_phon, phoneme, phon_cmd = self.synth.phonemesFromText_(
-            word), '', word
+        orig_phon,phoneme,phon_cmd = self.synth.phonemesFromText_(word),'',word
         if variant == 'low':
             # self.synth.startSpeakingString_toURL_(word,d_url)
             phoneme = orig_phon

@@ -99,23 +75,16 @@ class SynthVariant(object):
         # self.synth.startSpeakingString_toURL_(word,d_url)
         fname = dest_filename(word,phoneme,self.name,self.rate)
         d_path = dest_path(self.name,self.rate,fname)
-        # d_url = NSURL.fileURLWithPath_(d_path)
+        d_url = dest_url(d_path)
         cli_gen_audio(phon_cmd,self.rate,self.name,d_path)
         return SynthFile(word,phoneme,fname,self.name,self.rate,variant)

 def synth_generator():
     voices_installed = NSSpeechSynthesizer.availableVoices()
-    voice_attrs = [
-        NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed
-    ]
-    us_voices_ids = [
-        v['VoiceIdentifier'] for v in voice_attrs
-        if v['VoiceLanguage'] == 'en-US'
-        and v['VoiceIdentifier'].split('.')[-1][0].isupper()
-    ]
-    # us_voices_ids = ['com.apple.speech.synthesis.voice.Fred',
-    # 'com.apple.speech.synthesis.voice.Alex',
+    voice_attrs = [NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed]
+    us_voices_ids = [v['VoiceIdentifier'] for v in voice_attrs if v['VoiceLanguage'] == 'en-US' and v['VoiceIdentifier'].split('.')[-1][0].isupper()]
+    # us_voices_ids = ['com.apple.speech.synthesis.voice.Fred','com.apple.speech.synthesis.voice.Alex',
     # 'com.apple.speech.synthesis.voice.Victoria']
     # voice_rates = list(range(150,221,(220-180)//4))
     voice_rates = [150,180,210,250]

@@ -125,7 +94,6 @@ def synth_generator():
         for r in voice_rates:
             create_dir(dest_dir+v+'/'+r)
             voice_synths.append(SynthVariant(v,r))
-
     def synth_for_words(words):
         all_synths = []
         for w in words:

@@ -133,10 +101,8 @@ def synth_generator():
             for v in ['low','medium','high']:
                 all_synths.append(s.generate_audio(w,v))
         return all_synths
-
     return synth_for_words
-
 def write_synths(synth_list,fname,csv=False):
     f = open(fname,'w')
     if csv:

@@ -146,14 +112,12 @@ def write_synths(synth_list, fname, csv=False):
         json.dump([s.get_json() for s in synth_list],f)
     f.close()
-
 def generate_audio_for_stories():
     stories_data = json.load(open('./inputs/all_stories_hs.json'))
     word_list = [t[0] for i in stories_data.values() for t in i]
     words_audio_synth = synth_generator()
     return words_audio_synth(word_list)
-
 # words_audio_synth = synth_generator()
 # synth = NSSpeechSynthesizer.alloc().init()
 # voices_installed = NSSpeechSynthesizer.availableVoices()
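cli_gen_audio shells out to the macOS say command. A standalone equivalent with illustrative arguments (the phoneme string here comes from running 'sunflowers' through the arpabet-to-apple.py mapping at the top of this diff; the real phon_cmd depends on variant branches not shown):

# Illustrative only: what subprocess.call executes for one generated sample.
import subprocess

subprocess.call(['say', '-v', 'Alex', '-r', '150',
                 '-o', '/tmp/sunflowers-demo.aiff',
                 '[[inpt PHON]] sUXnflAWUXrz'])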