1. fixed dimension issue in data

2. experimenting with different base network
master
Malar Kannan 2017-10-23 19:00:27 +05:30
parent e865f17a0d
commit 6f3bca61cf
4 changed files with 577 additions and 89 deletions

446
Siamese.ipynb Normal file

File diff suppressed because one or more lines are too long

View File

@ -21,6 +21,9 @@ from keras.layers import Dense, Dropout, Input, Lambda
from keras.optimizers import RMSprop from keras.optimizers import RMSprop
from keras import backend as K from keras import backend as K
%matplotlib inline
import matplotlib.pyplot as plt
num_classes = 10 num_classes = 10
@ -104,8 +107,6 @@ tr_pairs, tr_y = create_pairs(x_train, digit_indices)
digit_indices = [np.where(y_test == i)[0] for i in range(num_classes)] digit_indices = [np.where(y_test == i)[0] for i in range(num_classes)]
te_pairs, te_y = create_pairs(x_test, digit_indices) te_pairs, te_y = create_pairs(x_test, digit_indices)
tr_pairs.shape
# network definition # network definition
base_network = create_base_network(input_dim) base_network = create_base_network(input_dim)

View File

@ -2,6 +2,7 @@ import pandas as pd
import numpy as np import numpy as np
from spectro_gen import generate_aiff_spectrogram from spectro_gen import generate_aiff_spectrogram
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
import itertools
import pickle,gc import pickle,gc
def sunflower_data(): def sunflower_data():
@ -18,43 +19,43 @@ def sunflower_data():
return np.lib.pad(spgr,[(0, max_samples-orig), (0,0)],'median') return np.lib.pad(spgr,[(0, max_samples-orig), (0,0)],'median')
pad_sun = sunflowers['file'].apply(append_zeros).values pad_sun = sunflowers['file'].apply(append_zeros).values
x_data = np.vstack(pad_sun).reshape((sample_count,max_samples,sample_size,)) x_data = np.vstack(pad_sun).reshape((sample_count,max_samples,sample_size,))
# x_data.shape
# y_data.shape
# train_test_split(x_data,y_data,test_size=0.33)[].shape
# len(train_test_split(x_data,y_data,test_size=0.33))
# sunflowers.loc[:,'file'][0]
# generate_aiff_spectrogram('outputs/sunflowers-Alex-150-normal-589.aiff')
# sunflowers[sunflowers['variant'] == 'phoneme']
# sunflowers[sunflowers['variant'] == 'normal']
# for s in sunflowers.values:
# print(s)
return train_test_split(x_data,y_data,test_size=0.33) return train_test_split(x_data,y_data,test_size=0.33)
def get_siamese_pairs(groupF1, groupF2):
    """Build "same" and "different" row pairs from two groups of rows.

    Args:
        groupF1: object exposing ``iterrows()`` (callers in this file pass
            a pandas DataFrame slice); rows of the first group.
        groupF2: same kind of object; rows of the second group.

    Returns:
        A tuple ``(same_pairs, diff_pairs)``:

        * ``same_pairs`` - every unordered 2-combination of rows taken
          within ``groupF1``, followed by every 2-combination taken within
          ``groupF2``.
        * ``diff_pairs`` - every ``(row_from_groupF1, row_from_groupF2)``
          pair.

        Both lists are empty when the groups are empty.
    """
    # Materialise the rows once; the index yielded by iterrows() is not used.
    group1 = [row for _, row in groupF1.iterrows()]
    group2 = [row for _, row in groupF2.iterrows()]

    # The outer loop runs over group2 on purpose: callers slice the result
    # (e.g. diff[:10]), so the original ordering must be kept.
    diff_pairs = [(g1, g2) for g2 in group2 for g1 in group1]

    same_pairs = (list(itertools.combinations(group1, 2))
                  + list(itertools.combinations(group2, 2)))
    return (same_pairs, diff_pairs)
def sunflower_pairs_data(): def sunflower_pairs_data():
audio_samples = pd.read_csv('./outputs/audio.csv',names=['word','voice','rate','variant','file']) audio_samples = pd.read_csv('./outputs/audio.csv',names=['word','voice','rate','variant','file'])
sunflowers = audio_samples.loc[audio_samples['word'] == 'sunflowers'].reset_index(drop=True) audio_samples = audio_samples.loc[audio_samples['word'] == 'sunflowers'].reset_index(drop=True)
sunflowers.loc[:,'file'] = sunflowers.loc[:,'file'].apply(lambda x:'outputs/'+x).apply(generate_aiff_spectrogram) audio_samples.loc[:,'spectrogram'] = audio_samples.loc[:,'file'].apply(lambda x:'outputs/audio/'+x).apply(generate_aiff_spectrogram)
y_data = sunflowers['variant'].apply(lambda x:x=='normal').values max_samples = audio_samples['spectrogram'].apply(lambda x:x.shape[0]).max()
max_samples = sunflowers['file'].apply(lambda x:x.shape[0]).max() sample_size = audio_samples['spectrogram'][0].shape[1]
sample_size = sunflowers['file'][0].shape[1] same_data,diff_data = [],[]
sunflowers_pos = sunflowers[sunflowers['variant'] == 'normal'].reset_index(drop=True) for (w,g) in audio_samples.groupby(audio_samples['word']):
sunflowers_neg = sunflowers[sunflowers['variant'] == 'phoneme'].reset_index(drop=True) sample_norm = g.loc[audio_samples['variant'] == 'normal']
sample_phon = g.loc[audio_samples['variant'] == 'phoneme']
same , diff = get_siamese_pairs(sample_norm,sample_phon)
same_data.extend(same)
diff_data.extend(diff)
Y = np.hstack([np.ones(len(same_data)),np.zeros(len(diff_data))])
X_sample_pairs = same_data+diff_data
def append_zeros(spgr): def append_zeros(spgr):
return np.lib.pad(spgr,[(0, max_samples-spgr.shape[0]), (0,0)],'median') sample = np.lib.pad(spgr,[(0, max_samples-spgr.shape[0]), (0,0)],'median')
def create_data(sf): return np.expand_dims(sample,axis=0)
sample_count = sf['file'].shape[0] def create_X(sp):
pad_sun = sf['file'].apply(append_zeros).values # sample_count = sp[0]['file'].shape[0]
x_data = np.vstack(pad_sun).reshape((sample_count,max_samples,sample_size)) l_sample = append_zeros(sp[0]['spectrogram'])
return x_data r_sample = append_zeros(sp[1]['spectrogram'])#.apply(append_zeros).values
x_data_pos = create_data(sunflowers_pos) # x_data = np.vstack(pad_sun).reshape((sample_count,max_samples,sample_size))
x_data_neg = create_data(sunflowers_neg) return np.expand_dims(np.vstack([l_sample,r_sample]),axis=0)
x_pos_train, x_pos_test, x_neg_train, x_neg_test =train_test_split(x_data_pos,x_data_neg,test_size=0.33) X_list = (create_X(sp) for sp in X_sample_pairs)
tr_y = np.array(x_pos_train.shape[0]*[[1,0]]) X = np.vstack(X_list)
te_y = np.array(x_pos_test.shape[0]*[[1,0]]) tr_pairs,te_pairs,tr_y,te_y = train_test_split(X,Y,test_size=0.1)
tr_pairs = np.array([x_pos_train,x_neg_train]).reshape(x_pos_train.shape[0],2,max_samples,sample_size) return train_test_split(X,Y,test_size=0.1)
te_pairs = np.array([x_pos_test,x_neg_test]).reshape(x_pos_test.shape[0],2,max_samples,sample_size)
return tr_pairs,te_pairs,tr_y,te_y
def create_spectrogram_data(audio_group='audio'): def create_spectrogram_data(audio_group='audio'):
audio_samples = pd.read_csv('./outputs/'+audio_group+'.csv',names=['word','voice','rate','variant','file']) audio_samples = pd.read_csv('./outputs/'+audio_group+'.csv',names=['word','voice','rate','variant','file'])
@ -64,58 +65,71 @@ def create_spectrogram_data(audio_group='audio'):
def create_speech_pairs_data(audio_group='audio'): def create_speech_pairs_data(audio_group='audio'):
audio_samples = pd.read_pickle('outputs/spectrogram.pkl') audio_samples = pd.read_pickle('outputs/spectrogram.pkl')
y_data = audio_samples['variant'].apply(lambda x:x=='normal').values
max_samples = audio_samples['spectrogram'].apply(lambda x:x.shape[0]).max() max_samples = audio_samples['spectrogram'].apply(lambda x:x.shape[0]).max()
sample_size = audio_samples['spectrogram'][0].shape[1] sample_size = audio_samples['spectrogram'][0].shape[1]
pickle.dump((max_samples,sample_size),open('./spectrogram_vars.pkl','wb'))
audio_samples_pos = audio_samples[audio_samples['variant'] == 'normal'].reset_index(drop=True)
audio_samples_neg = audio_samples[audio_samples['variant'] == 'phoneme'].reset_index(drop=True)
def append_zeros(spgr):
return np.lib.pad(spgr,[(0, max_samples-spgr.shape[0]), (0,0)],'median')
def create_data(sf):
sample_count = sf['spectrogram'].shape[0]
pad_sun = sf['spectrogram'].apply(append_zeros).values
print('appended zeros')
x_data = np.vstack(pad_sun).reshape((sample_count,max_samples,sample_size))
print('reshaped')
return x_data
print('creating speech pair data')
x_data_pos = create_data(audio_samples_pos)
x_data_neg = create_data(audio_samples_neg)
np.save('outputs/x_data_pos.npy',x_data_pos)
np.save('outputs/x_data_neg.npy',x_data_neg)
print('pickled speech pairs')
def create_speech_model_data(): def append_zeros(spgr):
(max_samples,sample_size) = pickle.load(open('./spectrogram_vars.pkl','rb')) sample = np.lib.pad(spgr,[(0, max_samples-spgr.shape[0]), (0,0)],'median')
x_data_pos = np.load('outputs/x_data_pos.npy') return sample
x_data_neg = np.load('outputs/x_data_neg.npy') def create_X(sp):
x_pos_train, x_pos_test, x_neg_train, x_neg_test =train_test_split(x_data_pos,x_data_neg,test_size=0.33) l_sample = append_zeros(sp[0]['spectrogram'])
del x_data_pos r_sample = append_zeros(sp[1]['spectrogram'])
del x_data_neg return np.asarray([l_sample,r_sample])
print('generating siamese speech pairs')
same_data,diff_data = [],[]
for (w,g) in audio_samples.groupby(audio_samples['word']):
sample_norm = g.loc[audio_samples['variant'] == 'normal']#.reset_index(drop=True)
sample_phon = g.loc[audio_samples['variant'] == 'phoneme']#.reset_index(drop=True)
same , diff = get_siamese_pairs(sample_norm,sample_phon)
same_data.extend([create_X(s) for s in same[:10]])
diff_data.extend([create_X(d) for d in diff[:10]])
print('creating all speech pairs')
Y = np.hstack([np.ones(len(same_data)),np.zeros(len(diff_data))])
print('casting as array speech pairs')
X = np.asarray(same_data+diff_data)
print('pickling X/Y')
np.save('outputs/X.npy',X)
np.save('outputs/Y.npy',Y)
del X
gc.collect() gc.collect()
print('split train and test') print('train/test splitting speech pairs')
tr_y = np.array(x_pos_train.shape[0]*[[1,0]]) tr_pairs,te_pairs,tr_y,te_y = train_test_split(X,Y,test_size=0.1)
te_y = np.array(x_pos_test.shape[0]*[[1,0]]) print('pickling train/test')
tr_pairs = np.array([x_pos_train,x_neg_train]).reshape(x_pos_train.shape[0],2,max_samples,sample_size)
te_pairs = np.array([x_pos_test,x_neg_test]).reshape(x_pos_test.shape[0],2,max_samples,sample_size)
print('reshaped to input dim')
np.save('outputs/tr_pairs.npy',tr_pairs) np.save('outputs/tr_pairs.npy',tr_pairs)
np.save('outputs/te_pairs.npy',te_pairs) np.save('outputs/te_pairs.npy',te_pairs)
np.save('outputs/tr_y.npy',tr_y) np.save('outputs/tr_y.npy',tr_y)
np.save('outputs/te_y.npy',te_y) np.save('outputs/te_y.npy',te_y)
print('pickled speech model data')
# return tr_pairs,te_pairs,tr_y,te_y # def create_speech_model_data():
# (max_samples,sample_size) = pickle.load(open('./spectrogram_vars.pkl','rb'))
# x_data_pos = np.load('outputs/x_data_pos.npy')
# x_data_neg = np.load('outputs/x_data_neg.npy')
# x_pos_train, x_pos_test, x_neg_train, x_neg_test =train_test_split(x_data_pos,x_data_neg,test_size=0.1)
# del x_data_pos
# del x_data_neg
# gc.collect()
# print('split train and test')
# tr_y = np.array(x_pos_train.shape[0]*[1])
# te_y = np.array(x_pos_test.shape[0]*[[1,0]])
# tr_pairs = np.array([x_pos_train,x_neg_train]).reshape(x_pos_train.shape[0],2,max_samples,sample_size)
# te_pairs = np.array([x_pos_test,x_neg_test]).reshape(x_pos_test.shape[0],2,max_samples,sample_size)
# print('reshaped to input dim')
# np.save('outputs/tr_pairs.npy',tr_pairs)
# np.save('outputs/te_pairs.npy',te_pairs)
# np.save('outputs/tr_y.npy',tr_y)
# np.save('outputs/te_y.npy',te_y)
# print('pickled speech model data')
def speech_model_data(): def speech_model_data():
tr_pairs = np.load('outputs/tr_pairs.npy') tr_pairs = np.load('outputs/tr_pairs.npy').astype(np.float32)/255.0
te_pairs = np.load('outputs/te_pairs.npy') te_pairs = np.load('outputs/te_pairs.npy').astype(np.float32)/255.0
tr_y = np.load('outputs/tr_y.npy') tr_y = np.load('outputs/tr_y.npy')
te_y = np.load('outputs/te_y.npy') te_y = np.load('outputs/te_y.npy')
return tr_pairs,te_pairs,tr_y,te_y return tr_pairs,te_pairs,tr_y,te_y
if __name__ == '__main__': if __name__ == '__main__':
# sunflower_pairs_data()
#create_spectrogram_data() #create_spectrogram_data()
# create_speech_pairs_data() create_speech_pairs_data()
# create_speech_model_data() # print(speech_model_data())
print(speech_model_data())

View File

@ -14,17 +14,21 @@ from __future__ import absolute_import
from __future__ import print_function from __future__ import print_function
import numpy as np import numpy as np
import random # import random
# from keras.datasets import mnist # from keras.datasets import mnist
from speech_data import speech_model_data from speech_data import speech_model_data
from keras.models import Model from keras.models import Model
from keras.layers import Dense, Dropout, Input, Lambda, LSTM, SimpleRNN from keras.layers import Input, Dense, Dropout, SimpleRNN, LSTM, Lambda
from keras.optimizers import RMSprop # Dense, Dropout, Input, Lambda, LSTM, SimpleRNN
from keras.optimizers import RMSprop, SGD
from keras.callbacks import TensorBoard
from keras import backend as K from keras import backend as K
def euclidean_distance(vects): def euclidean_distance(vects):
x, y = vects x, y = vects
return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon())) return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True),
K.epsilon()))
def eucl_dist_output_shape(shapes): def eucl_dist_output_shape(shapes):
@ -37,20 +41,35 @@ def contrastive_loss(y_true, y_pred):
http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
''' '''
margin = 1 margin = 1
# print(y_true, y_pred)
return K.mean(y_true * K.square(y_pred) + return K.mean(y_true * K.square(y_pred) +
(1 - y_true) * K.square(K.maximum(margin - y_pred, 0))) (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))
def create_base_network(input_dim): def create_base_rnn_network(input_dim):
'''Base network to be shared (eq. to feature extraction). '''Base network to be shared (eq. to feature extraction).
''' '''
inp = Input(shape=input_dim) inp = Input(shape=input_dim)
sr1 = SimpleRNN(128)(inp) # d1 = Dense(1024, activation='sigmoid')(inp)
# sr2 = LSTM(128)(sr1) # # d2 = Dense(2, activation='sigmoid')(d1)
# sr2 = SimpleRNN(128)(sr) ls1 = LSTM(1024, return_sequences=True)(inp)
x = Dense(128, activation='relu')(sr1) ls2 = LSTM(512, return_sequences=True)(ls1)
return Model(inp, x) ls3 = LSTM(32)(ls2) # , return_sequences=True
# sr2 = SimpleRNN(128, return_sequences=True)(sr1)
# sr3 = SimpleRNN(32)(sr2)
# x = Dense(128, activation='relu')(sr1)
return Model(inp, ls3)
def create_base_network(input_dim):
    '''Base network to be shared (eq. to feature extraction).

    Builds a stack of three 128-unit ReLU ``Dense`` layers with a
    ``Dropout(0.1)`` between consecutive dense layers.

    input_dim: shape tuple forwarded to ``Input(shape=...)``.
    Returns a ``Model`` mapping the input tensor to the last dense output.
    '''
    # Named `inp` (not `input`) so the Python builtin is not shadowed.
    inp = Input(shape=input_dim)
    x = Dense(128, activation='relu')(inp)
    # Two further Dropout -> Dense stages, created in the same order as the
    # unrolled original (Dense, Dropout, Dense, Dropout, Dense).
    for _ in range(2):
        x = Dropout(0.1)(x)
        x = Dense(128, activation='relu')(x)
    return Model(inp, x)
def compute_accuracy(y_true, y_pred): def compute_accuracy(y_true, y_pred):
'''Compute classification accuracy with a fixed threshold on distances. '''Compute classification accuracy with a fixed threshold on distances.
@ -66,8 +85,8 @@ def accuracy(y_true, y_pred):
# the data, shuffled and split between train and test sets # the data, shuffled and split between train and test sets
tr_pairs,te_pairs,tr_y,te_y = speech_model_data() tr_pairs, te_pairs, tr_y, te_y = speech_model_data()
# y_train.shape,y_test.shape # y_train.shape,y_test.shape
# x_train.shape,x_test.shape # x_train.shape,x_test.shape
# x_train = x_train.reshape(60000, 784) # x_train = x_train.reshape(60000, 784)
# x_test = x_test.reshape(10000, 784) # x_test = x_test.reshape(10000, 784)
@ -75,11 +94,11 @@ tr_pairs,te_pairs,tr_y,te_y = speech_model_data()
# x_test = x_test.astype('float32') # x_test = x_test.astype('float32')
# x_train /= 255 # x_train /= 255
# x_test /= 255 # x_test /= 255
input_dim = tr_pairs.shape[2:] input_dim = (tr_pairs.shape[2], tr_pairs.shape[3])
epochs = 20 epochs = 20
# network definition # network definition
base_network = create_base_network(input_dim) base_network = create_base_rnn_network(input_dim)
input_a = Input(shape=input_dim) input_a = Input(shape=input_dim)
input_b = Input(shape=input_dim) input_b = Input(shape=input_dim)
@ -90,17 +109,25 @@ processed_a = base_network(input_a)
processed_b = base_network(input_b) processed_b = base_network(input_b)
distance = Lambda(euclidean_distance, distance = Lambda(euclidean_distance,
output_shape=eucl_dist_output_shape)([processed_a, processed_b]) output_shape=eucl_dist_output_shape)(
[processed_a, processed_b]
)
model = Model([input_a, input_b], distance) model = Model([input_a, input_b], distance)
tb_cb = TensorBoard(log_dir='./siamese_logs', histogram_freq=1, batch_size=32,
write_graph=True, write_grads=True, write_images=True,
embeddings_freq=0, embeddings_layer_names=None,
embeddings_metadata=None)
# train # train
rms = RMSprop() rms = RMSprop(lr=0.00001) # lr=0.001)
sgd = SGD(lr=0.001)
model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy]) model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])
model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y, model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,
batch_size=128, batch_size=128,
epochs=epochs, epochs=epochs,
validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y)) validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),
callbacks=[tb_cb])
# compute final accuracy on training and test sets # compute final accuracy on training and test sets
y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]]) y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])