parent e6f0c8b21b  commit 82d0398d2c
@@ -1,55 +0,0 @@
#!/usr/bin/env python3

"""
Convert ARPABET <http://www.speech.cs.cmu.edu/cgi-bin/cmudict>
to Apple's codes <https://developer.apple.com/library/content/documentation/UserExperience/Conceptual/SpeechSynthesisProgrammingGuide/Phonemes/Phonemes.html>
"""

import sys


mapping = {s.split()[0]: s.split()[1] for s in """
AA AA
AE AE
AH UX
AO AO
AW AW
AY AY
B b
CH C
D d
DH D
EH EH
ER UXr
EY EY
F f
G g
HH h
IH IH
IY IY
JH J
K k
L l
M m
N n
NG N
OW OW
OY OY
P p
R r
S s
SH S
T t
TH T
UH UH
UW UW
V v
W w
Y y
Z z
ZH Z
""".strip().split('\n')}

arpabet_phonemes = sys.stdin.read().split()
apple_phonemes = [mapping[p.upper()] for p in arpabet_phonemes]
print('[[inpt PHON]] ' + ''.join(apple_phonemes))
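A hedged sanity check of the table (not part of the diff): the script assumes stress digits such as AH1 are already stripped, since the dict has no stressed keys. A minimal subset is enough to spell "sunflowers":

# Subset of the mapping above; the script's full dict covers all 39 phonemes.
mapping = {'S': 's', 'AH': 'UX', 'N': 'n', 'F': 'f',
           'L': 'l', 'AW': 'AW', 'ER': 'UXr', 'Z': 'z'}
phones = 'S AH N F L AW ER Z'.split()  # "sunflowers", stress digits removed
print('[[inpt PHON]] ' + ''.join(mapping[p] for p in phones))
# -> [[inpt PHON]] sUXnflAWUXrz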
@@ -1,10 +0,0 @@
import pandas as pd

audio_file = pd.read_csv('./outputs/audio.csv',names=['word','voice','rate','type','filename'])
word_groups = audio_file.groupby('word')
# audio
lst = [1, 2, 3, 1, 2, 3]
s = pd.Series([1, 2, 3, 10, 20, 30], lst)
df3 = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})

s.groupby(level=0).sum()
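# Worked example (not in the diff): for the series above, values
# [1, 2, 3, 10, 20, 30] on index [1, 2, 3, 1, 2, 3], grouping on the
# index level sums the duplicate labels:
#   1    11
#   2    22
#   3    33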
@@ -2,14 +2,24 @@ import pyaudio
import numpy as np
from matplotlib import pyplot as plt

CHUNKSIZE = 1024 # fixed chunk size
CHUNKSIZE = 44100 * 10  # fixed chunk size

# initialize portaudio
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=1, rate=44100, input=True, frames_per_buffer=CHUNKSIZE)
p_inp = pyaudio.PyAudio()
# dev_n = p.get_device_count()
# dev_infos = [p.get_device_info_by_index(index) for index in range(dev_n)]
# [i for i in dev_infos]  # if i['name'] == 'record']
stream = p_inp.open(
    format=pyaudio.paInt24,
    channels=2,
    rate=44100,
    input=True,
    frames_per_buffer=CHUNKSIZE)

# do this as long as you want fresh samples
data = stream.read(CHUNKSIZE)
len(data)
CHUNKSIZE*10
numpydata = np.fromstring(data, dtype=np.int16)
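One caveat (a hedged sketch, not part of the diff): the stream above was opened as 2-channel paInt24, but the line above decodes int16. NumPy has no int24 dtype, so a correct decode has to assemble the 3-byte little-endian samples by hand:

import numpy as np

raw = np.frombuffer(data, dtype=np.uint8).reshape(-1, 3)  # one row per sample
samples = (raw[:, 0].astype(np.int32)           # little-endian byte order
           | (raw[:, 1].astype(np.int32) << 8)
           | (raw[:, 2].astype(np.int32) << 16))
samples -= (samples & 0x800000) << 1            # sign-extend the 24-bit values
stereo = samples.reshape(-1, 2)                 # columns: left, right channel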

# plot data
@@ -19,4 +29,27 @@ plt.show()
# close stream
stream.stop_stream()
stream.close()
p.terminate()
p_inp.terminate()
# open the file for reading.
# wf = wave.open(sys.argv[1], 'rb')

# create an audio object
# p = pyaudio.PyAudio()

# open stream based on the wave object which has been input.
p_oup = pyaudio.PyAudio()
stream = p_oup.open(
    format=pyaudio.paInt24, channels=2, rate=44100, output=True)

# read data (based on the chunk size)
# data = wf.readframes(CHUNKSIZE)

# play stream (looping from beginning of file to the end)
# while data != '':
# writing to the stream is what *actually* plays the sound.
stream.write(data)
# data = wf.readframes(chunk)

# cleanup stuff.
stream.close()
p_oup.terminate()
snippets.py
@@ -1,12 +0,0 @@
# import scipy.signal as sg
# import pysndfile.sndio as snd
#
# snd_data,samples,_ = snd.read('./outputs/sunflowers-Alex-150-normal-589.aiff')
# samples_per_seg = 3*int(samples*150/(3*1000))
# # samples/(len(snd_data)*1000.0)
# len(snd_data)
# samples_per_seg/2
#
# len(sg.spectrogram(snd_data,nperseg=samples_per_seg,noverlap=samples_per_seg/3)[2])
#
# from spectro_gen import generate_aiff_spectrogram
@@ -1,6 +1,7 @@
#!/usr/bin/env python
#coding: utf-8
""" This work is licensed under a Creative Commons Attribution 3.0 Unported License.

""" This work is licensed under a Creative Commons Attribution 3.0 Unported
License.
Frank Zalkow, 2012-2013
http://www.frank-zalkow.de/en/code-snippets/create-audio-spectrograms-with-python.html?i=1
"""
@@ -9,8 +10,9 @@ import numpy as np
from matplotlib import pyplot as plt
from pysndfile import sndio as snd
from numpy.lib import stride_tricks

""" short time fourier transform of audio signal """


def stft(sig, frameSize, overlapFac=0.5, window=np.hanning):
    win = window(frameSize)
    hopSize = int(frameSize - np.floor(overlapFac * frameSize))
@@ -18,82 +20,103 @@ def stft(sig, frameSize, overlapFac=0.5, window=np.hanning):
    # zeros at beginning (thus center of 1st window should be for sample nr. 0)
    # sig = (sig*255).astype(np.uint8)
    # import pdb;pdb.set_trace()
    count = int(np.floor(frameSize/2.0))
    count = int(np.floor(frameSize / 2.0))
    # import pdb;pdb.set_trace()
    samples = np.append(np.zeros(count), sig)
    # cols for windowing
    cols = int(np.ceil( (len(samples) - frameSize) / float(hopSize)) + 1)
    cols = int(np.ceil((len(samples) - frameSize) / float(hopSize)) + 1)
    # zeros at end (thus samples can be fully covered by frames)
    samples = np.append(samples, np.zeros(frameSize))

    frames = stride_tricks.as_strided(samples, shape=(cols, frameSize), strides=(samples.strides[0]*hopSize, samples.strides[0])).copy()
    frames = stride_tricks.as_strided(
        samples,
        shape=(cols, frameSize),
        strides=(samples.strides[0] * hopSize, samples.strides[0])).copy()
    frames *= win

    return np.fft.rfft(frames)
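A hedged usage sketch (not part of the diff), assuming the stft above is in scope; it mirrors the 150 ms frame and one-third overlap the callers below pass:

import numpy as np

sr = 44100
tone = np.sin(2 * np.pi * 440 * np.arange(sr) / sr)  # one second of A440
spec = stft(tone, frameSize=sr * 150 // 1000, overlapFac=1.0 / 3)
print(spec.shape)  # (number of frames, frameSize // 2 + 1) complex bins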


""" scale frequency axis logarithmically """


def logscale_spec(spec, sr=44100, factor=20.):
    timebins, freqbins = np.shape(spec)

    scale = np.linspace(0, 1, freqbins) ** factor
    scale *= (freqbins-1)/max(scale)
    scale = np.linspace(0, 1, freqbins)**factor
    scale *= (freqbins - 1) / max(scale)
    scale = np.unique(np.round(scale)).astype(np.uint32)
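    # note: with factor=1.0, the value both callers below pass, this scale
    # is linear and the re-binning below is effectively an identity pass;
    # the function then mainly attaches center frequencies to the bins.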
    # import pdb;pdb.set_trace()
    # create spectrogram with new freq bins
    newspec = np.complex128(np.zeros([timebins, len(scale)]))
    for i in range(0, len(scale)):
        if i == len(scale)-1:
            newspec[:,i] = np.sum(spec[:,scale[i]:], axis=1)
        if i == len(scale) - 1:
            newspec[:, i] = np.sum(spec[:, scale[i]:], axis=1)
        else:
            newspec[:,i] = np.sum(spec[:,scale[i]:scale[i+1]], axis=1)
            newspec[:, i] = np.sum(spec[:, scale[i]:scale[i + 1]], axis=1)

    # list center freq of bins
    allfreqs = np.abs(np.fft.fftfreq(freqbins*2, 1./sr)[:freqbins+1])
    allfreqs = np.abs(np.fft.fftfreq(freqbins * 2, 1. / sr)[:freqbins + 1])
    freqs = []
    for i in range(0, len(scale)):
        if i == len(scale)-1:
        if i == len(scale) - 1:
            freqs += [np.mean(allfreqs[scale[i]:])]
        else:
            freqs += [np.mean(allfreqs[scale[i]:scale[i+1]])]
            freqs += [np.mean(allfreqs[scale[i]:scale[i + 1]])]

    return newspec, freqs


""" generate spectrogram for aiff audio with 150ms windows and 50ms overlap"""


def generate_aiff_spectrogram(audiopath):
    samples,samplerate,_ = snd.read(audiopath)
    samples, samplerate, _ = snd.read(audiopath)
    # samplerate, samples = wav.read(audiopath)
    # s = stft(samples, binsize)
    s = stft(samples, samplerate*150//1000,1.0/3)
    s = stft(samples, samplerate * 150 // 1000, 1.0 / 3)

    sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate)
    ims = 20.*np.log10(np.abs(sshow)/10e-6)
    ims = 20. * np.log10(np.abs(sshow) / 10e-6)
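    # note: 10e-6 (i.e. 1e-5) acts as the dB reference amplitude, so
    # quieter bins come out negative; speech_data.speech_model_data later
    # clips those negatives to zero.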
    return ims


""" plot spectrogram"""


def plotstft(audiopath, binsize=2**10, plotpath=None, colormap="jet"):
    samples,samplerate,_ = snd.read(audiopath)
    samples, samplerate, _ = snd.read(audiopath)
    # samplerate, samples = wav.read(audiopath)
    # s = stft(samples, binsize)
    # print(samplerate*150//1000)
    s = stft(samples, samplerate*150//1000,1.0/3)
    s = stft(samples, samplerate * 150 // 1000, 1.0 / 3)

    sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate)
    ims = 20.*np.log10(np.abs(sshow)/10e-6)  # amplitude to decibel
    ims = 20. * np.log10(np.abs(sshow) / 10e-6)  # amplitude to decibel

    timebins, freqbins = np.shape(ims)
    # import pdb;pdb.set_trace()
    plt.figure(figsize=(15, 7.5))
    plt.imshow(np.transpose(ims), origin="lower", aspect="auto", cmap=colormap, interpolation="none")
    plt.imshow(
        np.transpose(ims),
        origin="lower",
        aspect="auto",
        cmap=colormap,
        interpolation="none")
    plt.colorbar()

    plt.xlabel("time (s)")
    plt.ylabel("frequency (hz)")
    plt.xlim([0, timebins-1])
    plt.xlim([0, timebins - 1])
    plt.ylim([0, freqbins])

    xlocs = np.float32(np.linspace(0, timebins-1, 5))
    plt.xticks(xlocs, ["%.02f" % l for l in ((xlocs*len(samples)/timebins)+(0.5*binsize))/samplerate])
    ylocs = np.int16(np.round(np.linspace(0, freqbins-1, 10)))
    xlocs = np.float32(np.linspace(0, timebins - 1, 5))
    plt.xticks(xlocs, [
        "%.02f" % l
        for l in (
            (xlocs * len(samples) / timebins) + (0.5 * binsize)) / samplerate
    ])
    ylocs = np.int16(np.round(np.linspace(0, freqbins - 1, 10)))
    plt.yticks(ylocs, ["%.02f" % freq[i] for i in ylocs])

    if plotpath:
@@ -103,6 +126,13 @@ def plotstft(audiopath, binsize=2**10, plotpath=None, colormap="jet"):

    plt.clf()


snd.get_info('./outputs/sunflowers-Alex-150-normal-589.aiff')
snd_data_arr = snd.read('./outputs/sunflowers-Alex-150-normal-589.aiff')[0]
snd_data = snd_data_arr.tobytes()
snd_data_arr.dtype
len(snd_data)

if __name__ == '__main__':
    plotstft('./outputs/sunflowers-Alex-150-normal-589.aiff')
    plotstft('./outputs/sunflowers-Alex-180-normal-4763.aiff')
speech_data.py
@@ -3,135 +3,124 @@ import numpy as np
from spectro_gen import generate_aiff_spectrogram
from sklearn.model_selection import train_test_split
import itertools
import pickle,gc
import gc

def sunflower_data():
    audio_samples = pd.read_csv('./outputs/audio.csv',names=['word','voice','rate','variant','file'])
    sunflowers = audio_samples.loc[audio_samples['word'] == 'sunflowers'].reset_index(drop=True)
    sunflowers.loc[:,'file'] = sunflowers.loc[:,'file'].apply(lambda x:'outputs/'+x).apply(generate_aiff_spectrogram)
    y_data = sunflowers['variant'].apply(lambda x:x=='normal').values
    max_samples = sunflowers['file'].apply(lambda x:x.shape[0]).max()
    sample_size = sunflowers['file'][0].shape[1]
    sample_count = sunflowers['file'].shape[0]
    sunflowers['file'][0].shape[0]
    def append_zeros(spgr):
        orig = spgr.shape[0]
        return np.lib.pad(spgr,[(0, max_samples-orig), (0,0)],'median')
    pad_sun = sunflowers['file'].apply(append_zeros).values
    x_data = np.vstack(pad_sun).reshape((sample_count,max_samples,sample_size,))
    return train_test_split(x_data,y_data,test_size=0.33)

def get_siamese_pairs(groupF1,groupF2):
    group1 = [r for (i,r) in groupF1.iterrows()]
    group2 = [r for (i,r) in groupF2.iterrows()]
    f = [(g1,g2) for g2 in group2 for g1 in group1]
    t = [i for i in itertools.combinations(group1,2)]+[i for i in itertools.combinations(group2,2)]
    return (t,f)
def get_siamese_pairs(groupF1, groupF2):
    group1 = [r for (i, r) in groupF1.iterrows()]
    group2 = [r for (i, r) in groupF2.iterrows()]
    f = [(g1, g2) for g2 in group2 for g1 in group1]
    t = [i for i in itertools.combinations(group1, 2)
         ] + [i for i in itertools.combinations(group2, 2)]
    return (t, f)
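# Worked example (not in the diff): for two groups of four rows each,
# t holds C(4,2) + C(4,2) = 12 same-variant pairs and f holds
# 4 * 4 = 16 cross-variant pairs.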


def sunflower_pairs_data():
    audio_samples = pd.read_csv('./outputs/audio.csv',names=['word','voice','rate','variant','file'])
    audio_samples = audio_samples.loc[audio_samples['word'] == 'sunflowers'].reset_index(drop=True)
    audio_samples.loc[:,'spectrogram'] = audio_samples.loc[:,'file'].apply(lambda x:'outputs/audio/'+x).apply(generate_aiff_spectrogram)
    max_samples = audio_samples['spectrogram'].apply(lambda x:x.shape[0]).max()
    sample_size = audio_samples['spectrogram'][0].shape[1]
    same_data,diff_data = [],[]
    for (w,g) in audio_samples.groupby(audio_samples['word']):
    audio_samples = pd.read_csv(
        './outputs/audio.csv',
        names=['word', 'voice', 'rate', 'variant', 'file'])
    audio_samples = audio_samples.loc[audio_samples['word'] ==
                                      'sunflowers'].reset_index(drop=True)
    audio_samples.loc[:, 'spectrogram'] = audio_samples.loc[:, 'file'].apply(
        lambda x: 'outputs/audio/' + x).apply(generate_aiff_spectrogram)
    max_samples = audio_samples['spectrogram'].apply(
        lambda x: x.shape[0]).max()
    same_data, diff_data = [], []
    for (w, g) in audio_samples.groupby(audio_samples['word']):
        sample_norm = g.loc[audio_samples['variant'] == 'normal']
        sample_phon = g.loc[audio_samples['variant'] == 'phoneme']
        same , diff = get_siamese_pairs(sample_norm,sample_phon)
        same, diff = get_siamese_pairs(sample_norm, sample_phon)
        same_data.extend(same)
        diff_data.extend(diff)
    Y = np.hstack([np.ones(len(same_data)),np.zeros(len(diff_data))])
    X_sample_pairs = same_data+diff_data
    Y = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))])
    X_sample_pairs = same_data + diff_data

    def append_zeros(spgr):
        sample = np.lib.pad(spgr,[(0, max_samples-spgr.shape[0]), (0,0)],'median')
        return np.expand_dims(sample,axis=0)
        sample = np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
                            'median')
        return np.expand_dims(sample, axis=0)

    def create_X(sp):
        # sample_count = sp[0]['file'].shape[0]
        l_sample = append_zeros(sp[0]['spectrogram'])
        r_sample = append_zeros(sp[1]['spectrogram'])#.apply(append_zeros).values
        # x_data = np.vstack(pad_sun).reshape((sample_count,max_samples,sample_size))
        return np.expand_dims(np.vstack([l_sample,r_sample]),axis=0)
        r_sample = append_zeros(
            sp[1]['spectrogram'])
        return np.expand_dims(np.vstack([l_sample, r_sample]), axis=0)

    X_list = (create_X(sp) for sp in X_sample_pairs)
    X = np.vstack(X_list)
    tr_pairs,te_pairs,tr_y,te_y = train_test_split(X,Y,test_size=0.1)
    return train_test_split(X,Y,test_size=0.1)
    tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1)
    return train_test_split(X, Y, test_size=0.1)


def create_spectrogram_data(audio_group='audio'):
    audio_samples = pd.read_csv('./outputs/'+audio_group+'.csv',names=['word','voice','rate','variant','file'])
    # audio_samples = audio_samples.loc[audio_samples['word'] == 'sunflowers'].reset_index(drop=True)
    audio_samples.loc[:,'spectrogram'] = audio_samples.loc[:,'file'].apply(lambda x:'outputs/'+audio_group+'/'+x).apply(generate_aiff_spectrogram)
    audio_samples = pd.read_csv(
        './outputs/' + audio_group + '.csv',
        names=['word', 'voice', 'rate', 'variant', 'file'])
    # audio_samples = audio_samples.loc[audio_samples['word'] ==
    # 'sunflowers'].reset_index(drop=True)
    audio_samples.loc[:, 'spectrogram'] = audio_samples.loc[:, 'file'].apply(
        lambda x: 'outputs/' + audio_group + '/' + x).apply(
            generate_aiff_spectrogram)
    audio_samples.to_pickle('outputs/spectrogram.pkl')


def create_speech_pairs_data(audio_group='audio'):
    audio_samples = pd.read_pickle('outputs/spectrogram.pkl')
    max_samples = audio_samples['spectrogram'].apply(lambda x:x.shape[0]).max()
    sample_size = audio_samples['spectrogram'][0].shape[1]
    max_samples = audio_samples['spectrogram'].apply(
        lambda x: x.shape[0]).max()

    # sample_size = audio_samples['spectrogram'][0].shape[1]

    def append_zeros(spgr):
        sample = np.lib.pad(spgr,[(0, max_samples-spgr.shape[0]), (0,0)],'median')
        sample = np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
                            'median')
        return sample
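        # (note: pad mode 'median' fills the added rows with median values
        # taken from the existing data, growing every spectrogram to a
        # common max_samples height without injecting hard zeros.)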

    def create_X(sp):
        l_sample = append_zeros(sp[0]['spectrogram'])
        r_sample = append_zeros(sp[1]['spectrogram'])
        return np.asarray([l_sample,r_sample])
        return np.asarray([l_sample, r_sample])

    print('generating siamese speech pairs')
    same_data,diff_data = [],[]
    for (w,g) in audio_samples.groupby(audio_samples['word']):
        sample_norm = g.loc[audio_samples['variant'] == 'normal']#.reset_index(drop=True)
        sample_phon = g.loc[audio_samples['variant'] == 'phoneme']#.reset_index(drop=True)
        same , diff = get_siamese_pairs(sample_norm,sample_phon)
    same_data, diff_data = [], []
    for (w, g) in audio_samples.groupby(audio_samples['word']):
        sample_norm = g.loc[audio_samples['variant'] == 'normal']
        sample_phon = g.loc[audio_samples['variant'] == 'phoneme']
        same, diff = get_siamese_pairs(sample_norm, sample_phon)
        same_data.extend([create_X(s) for s in same[:10]])
        diff_data.extend([create_X(d) for d in diff[:10]])
    print('creating all speech pairs')
    Y = np.hstack([np.ones(len(same_data)),np.zeros(len(diff_data))])
    Y = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))])
    print('casting as array speech pairs')
    X = np.asarray(same_data+diff_data)
    X = np.asarray(same_data + diff_data)
    print('pickling X/Y')
    np.save('outputs/X.npy',X)
    np.save('outputs/Y.npy',Y)
    del X
    np.save('outputs/X.npy', X)
    np.save('outputs/Y.npy', Y)
    del same_data
    del diff_data
    gc.collect()
    print('train/test splitting speech pairs')
    tr_pairs,te_pairs,tr_y,te_y = train_test_split(X,Y,test_size=0.1)
    tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1)
    print('pickling train/test')
    np.save('outputs/tr_pairs.npy',tr_pairs)
    np.save('outputs/te_pairs.npy',te_pairs)
    np.save('outputs/tr_y.npy',tr_y)
    np.save('outputs/te_y.npy',te_y)
    np.save('outputs/tr_pairs.npy', tr_pairs)
    np.save('outputs/te_pairs.npy', te_pairs)
    np.save('outputs/tr_y.npy', tr_y)
    np.save('outputs/te_y.npy', te_y)

# def create_speech_model_data():
#     (max_samples,sample_size) = pickle.load(open('./spectrogram_vars.pkl','rb'))
#     x_data_pos = np.load('outputs/x_data_pos.npy')
#     x_data_neg = np.load('outputs/x_data_neg.npy')
#     x_pos_train, x_pos_test, x_neg_train, x_neg_test =train_test_split(x_data_pos,x_data_neg,test_size=0.1)
#     del x_data_pos
#     del x_data_neg
#     gc.collect()
#     print('split train and test')
#     tr_y = np.array(x_pos_train.shape[0]*[1])
#     te_y = np.array(x_pos_test.shape[0]*[[1,0]])
#     tr_pairs = np.array([x_pos_train,x_neg_train]).reshape(x_pos_train.shape[0],2,max_samples,sample_size)
#     te_pairs = np.array([x_pos_test,x_neg_test]).reshape(x_pos_test.shape[0],2,max_samples,sample_size)
#     print('reshaped to input dim')
#     np.save('outputs/tr_pairs.npy',tr_pairs)
#     np.save('outputs/te_pairs.npy',te_pairs)
#     np.save('outputs/tr_y.npy',tr_y)
#     np.save('outputs/te_y.npy',te_y)
#     print('pickled speech model data')

def speech_model_data():
    tr_pairs = np.load('outputs/tr_pairs.npy')/255.0
    te_pairs = np.load('outputs/te_pairs.npy')/255.0
    tr_pairs = np.load('outputs/tr_pairs.npy') / 255.0
    te_pairs = np.load('outputs/te_pairs.npy') / 255.0
    tr_pairs[tr_pairs < 0] = 0
    te_pairs[te_pairs < 0] = 0
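    # (dividing the dB spectrograms by 255 and zeroing negatives is a crude
    # squash into a small non-negative range before feeding the network.)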
    tr_y = np.load('outputs/tr_y.npy')
    te_y = np.load('outputs/te_y.npy')
    return tr_pairs,te_pairs,tr_y,te_y
    return tr_pairs, te_pairs, tr_y, te_y


if __name__ == '__main__':
    # sunflower_pairs_data()
    #create_spectrogram_data()
    # create_spectrogram_data()
    create_speech_pairs_data()
    # print(speech_model_data())
@@ -1,4 +1,3 @@

from __future__ import absolute_import
from __future__ import print_function
import numpy as np
@@ -12,8 +11,8 @@ from keras import backend as K

def euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True),
                  K.epsilon()))
    return K.sqrt(
        K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))
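    # (K.epsilon() keeps the sqrt argument strictly positive, so the
    # gradient stays finite when the two embeddings coincide.)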


def eucl_dist_output_shape(shapes):
@@ -79,31 +78,44 @@ input_b = Input(shape=input_dim)
processed_a = base_network(input_a)
processed_b = base_network(input_b)

distance = Lambda(euclidean_distance,
                  output_shape=eucl_dist_output_shape)(
                      [processed_a, processed_b]
                  )
distance = Lambda(
    euclidean_distance,
    output_shape=eucl_dist_output_shape)([processed_a, processed_b])

model = Model([input_a, input_b], distance)

tb_cb = TensorBoard(log_dir='./logs/siamese_logs', histogram_freq=1,
                    batch_size=32, write_graph=True, write_grads=True,
                    write_images=True, embeddings_freq=0,
                    embeddings_layer_names=None, embeddings_metadata=None)
tb_cb = TensorBoard(
    log_dir='./logs/siamese_logs',
    histogram_freq=1,
    batch_size=32,
    write_graph=True,
    write_grads=True,
    write_images=True,
    embeddings_freq=0,
    embeddings_layer_names=None,
    embeddings_metadata=None)
cp_file_fmt = './models/siamese_speech_model-{epoch:02d}-epoch-{val_acc:0.2f}\
-acc.h5'
cp_cb = ModelCheckpoint(cp_file_fmt, monitor='val_acc', verbose=0,
                        save_best_only=False, save_weights_only=False,
                        mode='auto', period=1)

cp_cb = ModelCheckpoint(
    cp_file_fmt,
    monitor='val_acc',
    verbose=0,
    save_best_only=False,
    save_weights_only=False,
    mode='auto',
    period=1)
# train
rms = RMSprop(lr=0.001)
sgd = SGD(lr=0.001)
model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])
model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,
          batch_size=128,
          epochs=50,
          validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),
          callbacks=[tb_cb, cp_cb])
model.fit(
    [tr_pairs[:, 0], tr_pairs[:, 1]],
    tr_y,
    batch_size=128,
    epochs=50,
    validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),
    callbacks=[tb_cb, cp_cb])

model.save('./models/siamese_speech_model-final.h5')
# compute final accuracy on training and test sets
tts_samplegen.py
@@ -1,29 +1,42 @@
import objc
from AppKit import NSSpeechSynthesizer,NSSpeechInputModeProperty,NSSpeechModePhoneme
from Foundation import NSURL,NSError,NSObject
from AppKit import NSSpeechSynthesizer, NSSpeechInputModeProperty
from AppKit import NSSpeechModePhoneme
from Foundation import NSURL
import json
import random
import os
import re
import subprocess


OUTPUT_NAME = 'audio'
dest_dir = os.path.abspath('.')+'/outputs/'+OUTPUT_NAME+'/'
dest_file = './outputs/'+OUTPUT_NAME+'.csv'
dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
dest_file = './outputs/' + OUTPUT_NAME + '.csv'


def create_dir(direc):
    if not os.path.exists(direc):
        os.mkdir(direc)
dest_filename = lambda n,v,r,t: '{}-{}-{}-{}-'.format(n,v,r,t)+str(random.randint(0,10000))+'.aiff'
dest_path = lambda v,r,n: dest_dir+v+'/'+r+'/'+n
dest_url = lambda p: NSURL.fileURLWithPath_(p)

def cli_gen_audio(speech_cmd,rate,voice,out_path):
    subprocess.call(['say','-v',voice,'-r',str(rate),'-o',out_path,speech_cmd])

def dest_filename(n, v, r, t):
    return '{}-{}-{}-{}-'.format(n, v, r,
                                 t) + str(random.randint(0, 10000)) + '.aiff'


def dest_path(v, r, n):
    # the rate arrives as an int, so cast it before joining path segments
    return dest_dir + v + '/' + str(r) + '/' + n


def cli_gen_audio(speech_cmd, rate, voice, out_path):
    subprocess.call(
        ['say', '-v', voice, '-r',
         str(rate), '-o', out_path, speech_cmd])
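# Equivalent macOS shell invocation, e.g. for a phoneme-mode command:
#   say -v Alex -r 150 -o out.aiff '[[inpt PHON]] sUXnflAWUXrz'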


class SynthFile(object):
    """docstring for SynthFile."""
    def __init__(self,word,phon, filename,voice,rate,operation):

    def __init__(self, word, phon, filename, voice, rate, operation):
        super(SynthFile, self).__init__()
        self.word = word
        self.phoneme = phon
@@ -33,91 +46,114 @@ class SynthFile(object):
        self.variant = operation

    def get_json(self):
        return {'filename':self.filename,'voice':self.voice,
                'rate':self.rate,'operation':self.operation}
        return {
            'filename': self.filename,
            'voice': self.voice,
            'rate': self.rate,
            'operation': self.operation
        }

    def get_csv(self):
        return '{},{},{},{},{}\n'.format(self.word,self.phoneme,self.voice,self.rate,self.variant,self.filename)
        # the CSV readers expect (word, voice, rate, variant, file); a
        # five-placeholder format string silently drops extra arguments
        return '{},{},{},{},{}\n'.format(self.word, self.voice, self.rate,
                                         self.variant, self.filename)


class SynthVariant(object):
    """docstring for SynthVariant."""
    def __init__(self,identifier,rate):

    def __init__(self, identifier, rate):
        super(SynthVariant, self).__init__()
        self.synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
        self.synth.setVolume_(100)
        self.synth.setRate_(rate)
        self.phone_synth = NSSpeechSynthesizer.alloc().initWithVoice_(identifier)
        self.phone_synth = NSSpeechSynthesizer.alloc().initWithVoice_(
            identifier)
        self.phone_synth.setVolume_(100)
        self.phone_synth.setRate_(rate)
        self.phone_synth.setObject_forProperty_error_(NSSpeechModePhoneme,NSSpeechInputModeProperty,None)
        self.phone_synth.setObject_forProperty_error_(
            NSSpeechModePhoneme, NSSpeechInputModeProperty, None)
        self.identifier = identifier
        self.rate = rate
        self.name = identifier.split('.')[-1]

    def __repr__(self):
        return 'Synthesizer[{} - {}]({})'.format(self.name,self.rate)
        # two placeholders to match the two format arguments
        return 'Synthesizer[{} - {}]'.format(self.name, self.rate)

    def generate_audio(self,word,variant):
        orig_phon,phoneme,phon_cmd = self.synth.phonemesFromText_(word),'',word
    def generate_audio(self, word, variant):
        orig_phon, phoneme, phon_cmd = self.synth.phonemesFromText_(
            word), '', word
        if variant == 'low':
            # self.synth.startSpeakingString_toURL_(word,d_url)
            phoneme = orig_phon
        elif variant == 'medium':
            phoneme = re.sub('[0-9]','',orig_phon)
            phon_cmd = '[[inpt PHON]] '+phoneme
            phoneme = re.sub('[0-9]', '', orig_phon)
            phon_cmd = '[[inpt PHON]] ' + phoneme
        elif variant == 'high':
            phoneme = orig_phon
            phon_cmd = word
        # elif variant == 'long':
        #     if phon != '':
        #         self.phone_synth.startSpeakingString_toURL_(phon,d_url)
        #     else:
        #         self.synth.startSpeakingString_toURL_(word,d_url)
        fname = dest_filename(word,phoneme,self.name,self.rate)
        d_path = dest_path(self.name,self.rate,fname)
        d_url = dest_url(d_path)
        cli_gen_audio(phon_cmd,self.rate,self.name,d_path)
        return SynthFile(word,phoneme,fname,self.name,self.rate,variant)
        # if phon != '':
        #     self.phone_synth.startSpeakingString_toURL_(phon,d_url)
        # else:
        #     self.synth.startSpeakingString_toURL_(word,d_url)
        fname = dest_filename(word, phoneme, self.name, self.rate)
        d_path = dest_path(self.name, self.rate, fname)
        # d_url = NSURL.fileURLWithPath_(d_path)
        cli_gen_audio(phon_cmd, self.rate, self.name, d_path)
        return SynthFile(word, phoneme, fname, self.name, self.rate, variant)


def synth_generator():
    voices_installed = NSSpeechSynthesizer.availableVoices()
    voice_attrs = [NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed]
    us_voices_ids = [v['VoiceIdentifier'] for v in voice_attrs if v['VoiceLanguage'] == 'en-US' and v['VoiceIdentifier'].split('.')[-1][0].isupper()]
    # us_voices_ids = ['com.apple.speech.synthesis.voice.Fred','com.apple.speech.synthesis.voice.Alex',
    voice_attrs = [
        NSSpeechSynthesizer.attributesForVoice_(v) for v in voices_installed
    ]
    us_voices_ids = [
        v['VoiceIdentifier'] for v in voice_attrs
        if v['VoiceLanguage'] == 'en-US'
        and v['VoiceIdentifier'].split('.')[-1][0].isupper()
    ]
    # us_voices_ids = ['com.apple.speech.synthesis.voice.Fred',
    #                  'com.apple.speech.synthesis.voice.Alex',
    #                  'com.apple.speech.synthesis.voice.Victoria']
    # voice_rates = list(range(150,221,(220-180)//4))
    voice_rates = [150,180,210,250]
    voice_rates = [150, 180, 210, 250]
    voice_synths = []
    create_dir(dest_dir)
    for v in us_voices_ids:
        for r in voice_rates:
            create_dir(dest_dir+v+'/'+r)
            voice_synths.append(SynthVariant(v,r))
            # the rate is an int, so cast it before joining the path
            create_dir(dest_dir + v + '/' + str(r))
            voice_synths.append(SynthVariant(v, r))

    def synth_for_words(words):
        all_synths = []
        for w in words:
            for s in voice_synths:
                for v in ['low','medium','high']:
                    all_synths.append(s.generate_audio(w,v))
                for v in ['low', 'medium', 'high']:
                    all_synths.append(s.generate_audio(w, v))
        return all_synths

    return synth_for_words

def write_synths(synth_list,fname,csv=False):
    f = open(fname,'w')

def write_synths(synth_list, fname, csv=False):
    f = open(fname, 'w')
    if csv:
        for s in synth_list:
            f.write(s.get_csv())
    else:
        json.dump([s.get_json() for s in synth_list],f)
        json.dump([s.get_json() for s in synth_list], f)
    f.close()


def generate_audio_for_stories():
    stories_data = json.load(open('./inputs/all_stories_hs.json'))
    word_list = [t[0] for i in stories_data.values() for t in i]
    words_audio_synth = synth_generator()
    return words_audio_synth(word_list)


# words_audio_synth = synth_generator()
# synth = NSSpeechSynthesizer.alloc().init()
# voices_installed = NSSpeechSynthesizer.availableVoices()
@@ -131,5 +167,5 @@ def generate_audio_for_stories():

synths = synth_generator()([OUTPUT_NAME])
# synths = generate_audio_for_stories()
write_synths(synths,dest_file,True)
write_synths(synths, dest_file, True)
# write_synths(synths,'./outputs/synths.json')