Compare commits

...

4 Commits

Author SHA1 Message Date
Malar Kannan b3755ad80e updated tested pickling 2017-10-17 19:17:44 +05:30
Malar Kannan 88edcdd239 separated spectrogram generation code 2017-10-17 19:11:04 +05:30
Malar Kannan 51a6d6e804 added who data method 2017-10-17 19:04:07 +05:30
Malar Kannan 8ae5104201 added spectrogram to model data code and implemented simple rnn model 2017-10-17 18:56:42 +05:30
5 changed files with 208 additions and 3 deletions

.gitignore vendored (1 line changed)

@@ -139,3 +139,4 @@ Temporary Items
 outputs/*
 inputs/mnist
+inputs/audio*

(MNIST Siamese training script; file name not shown)

@@ -17,7 +17,7 @@ import numpy as np
 import random
 from keras.datasets import mnist
 from keras.models import Model
-from keras.layers import Dense, Dropout, Input, Lambda, Recurrent
+from keras.layers import Dense, Dropout, Input, Lambda
 from keras.optimizers import RMSprop
 from keras import backend as K
@@ -104,6 +104,7 @@ tr_pairs, tr_y = create_pairs(x_train, digit_indices)
 digit_indices = [np.where(y_test == i)[0] for i in range(num_classes)]
 te_pairs, te_y = create_pairs(x_test, digit_indices)
+tr_pairs.shape
 # network definition
 base_network = create_base_network(input_dim)
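The import fix above drops `Recurrent`, which in the Keras of this era (2.0.x) is the abstract base class behind the RNN layers rather than a layer you instantiate yourself; the concrete layers are `SimpleRNN`, `GRU`, and `LSTM`, which is what the new speech_siamese.py below imports. A minimal sketch, assuming Keras 2:

# Concrete recurrent layers in Keras 2; `Recurrent` is only their abstract
# base class and is not meant to be used directly in a model.
from keras.layers import SimpleRNN, GRU, LSTM

rnn = SimpleRNN(128)  # 128 recurrent units, as used later in this compare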

spectro_gen.py

@@ -63,7 +63,7 @@ def generate_aiff_spectrogram(audiopath):
     samples,samplerate,_ = snd.read(audiopath)
     # samplerate, samples = wav.read(audiopath)
     # s = stft(samples, binsize)
-    s = stft(samples, samplerate*150/1000,1.0/3)
+    s = stft(samples, samplerate*150//1000,1.0/3)
     sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate)
     ims = 20.*np.log10(np.abs(sshow)/10e-6)
@@ -74,7 +74,8 @@ def plotstft(audiopath, binsize=2**10, plotpath=None, colormap="jet"):
     samples,samplerate,_ = snd.read(audiopath)
     # samplerate, samples = wav.read(audiopath)
     # s = stft(samples, binsize)
-    s = stft(samples, samplerate*150/1000,1.0/3)
+    # print(samplerate*150//1000)
+    s = stft(samples, samplerate*150//1000,1.0/3)
     sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate)
     ims = 20.*np.log10(np.abs(sshow)/10e-6)  # amplitude to decibel
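The `/` to `//` change in both hunks is a Python 3 division fix: `/` is true division and returns a float even for integer operands, while the window length handed to `stft` needs to be an integer. `//` floors to an int. A quick illustration, assuming a 44.1 kHz sample rate:

samplerate = 44100
samplerate * 150 / 1000    # 6615.0: a float under Python 3 true division
samplerate * 150 // 1000   # 6615: an int, usable as a frame/window length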

speech_data.py (new file, 90 lines)

@@ -0,0 +1,90 @@
import pandas as pd
import numpy as np
from spectro_gen import generate_aiff_spectrogram
from sklearn.model_selection import train_test_split
import tensorflow as tf

def sunflower_data():
    audio_samples = pd.read_csv('./outputs/audio.csv', names=['word', 'voice', 'rate', 'variant', 'file'])
    sunflowers = audio_samples.loc[audio_samples['word'] == 'sunflowers'].reset_index(drop=True)
    sunflowers.loc[:, 'file'] = sunflowers.loc[:, 'file'].apply(lambda x: 'outputs/' + x).apply(generate_aiff_spectrogram)
    y_data = sunflowers['variant'].apply(lambda x: x == 'normal').values
    max_samples = sunflowers['file'].apply(lambda x: x.shape[0]).max()
    sample_size = sunflowers['file'][0].shape[1]
    sample_count = sunflowers['file'].shape[0]
    sunflowers['file'][0].shape[0]
    def append_zeros(spgr):
        orig = spgr.shape[0]
        return np.lib.pad(spgr, [(0, max_samples - orig), (0, 0)], 'median')
    pad_sun = sunflowers['file'].apply(append_zeros).values
    x_data = np.vstack(pad_sun).reshape((sample_count, max_samples, sample_size))
    # x_data.shape
    # y_data.shape
    # train_test_split(x_data,y_data,test_size=0.33)[].shape
    # len(train_test_split(x_data,y_data,test_size=0.33))
    # sunflowers.loc[:,'file'][0]
    # generate_aiff_spectrogram('outputs/sunflowers-Alex-150-normal-589.aiff')
    # sunflowers[sunflowers['variant'] == 'phoneme']
    # sunflowers[sunflowers['variant'] == 'normal']
    # for s in sunflowers.values:
    #     print(s)
    return train_test_split(x_data, y_data, test_size=0.33)

def sunflower_pairs_data():
    audio_samples = pd.read_csv('./outputs/audio.csv', names=['word', 'voice', 'rate', 'variant', 'file'])
    sunflowers = audio_samples.loc[audio_samples['word'] == 'sunflowers'].reset_index(drop=True)
    sunflowers.loc[:, 'file'] = sunflowers.loc[:, 'file'].apply(lambda x: 'outputs/' + x).apply(generate_aiff_spectrogram)
    y_data = sunflowers['variant'].apply(lambda x: x == 'normal').values
    max_samples = sunflowers['file'].apply(lambda x: x.shape[0]).max()
    sample_size = sunflowers['file'][0].shape[1]
    sunflowers_pos = sunflowers[sunflowers['variant'] == 'normal'].reset_index(drop=True)
    sunflowers_neg = sunflowers[sunflowers['variant'] == 'phoneme'].reset_index(drop=True)
    def append_zeros(spgr):
        return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)], 'median')
    def create_data(sf):
        sample_count = sf['file'].shape[0]
        pad_sun = sf['file'].apply(append_zeros).values
        x_data = np.vstack(pad_sun).reshape((sample_count, max_samples, sample_size))
        return x_data
    x_data_pos = create_data(sunflowers_pos)
    x_data_neg = create_data(sunflowers_neg)
    x_pos_train, x_pos_test, x_neg_train, x_neg_test = train_test_split(x_data_pos, x_data_neg, test_size=0.33)
    tr_y = np.array(x_pos_train.shape[0] * [[1, 0]])
    te_y = np.array(x_pos_test.shape[0] * [[1, 0]])
    tr_pairs = np.array([x_pos_train, x_neg_train]).reshape(x_pos_train.shape[0], 2, max_samples, sample_size)
    te_pairs = np.array([x_pos_test, x_neg_test]).reshape(x_pos_test.shape[0], 2, max_samples, sample_size)
    return tr_pairs, te_pairs, tr_y, te_y

def create_spectrogram_data(audio_group='audio'):
    audio_samples = pd.read_csv('./outputs/' + audio_group + '.csv', names=['word', 'voice', 'rate', 'variant', 'file'])
    # audio_samples = audio_samples.loc[audio_samples['word'] == 'sunflowers'].reset_index(drop=True)
    audio_samples.loc[:, 'spectrogram'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x).apply(generate_aiff_spectrogram)
    audio_samples.to_pickle('outputs/spectrogram.pkl')

def speech_pairs_data(audio_group='audio'):
    audio_samples = pd.read_pickle('outputs/spectrogram.pkl')
    y_data = audio_samples['variant'].apply(lambda x: x == 'normal').values
    max_samples = audio_samples['spectrogram'].apply(lambda x: x.shape[0]).max()
    sample_size = audio_samples['spectrogram'][0].shape[1]
    audio_samples_pos = audio_samples[audio_samples['variant'] == 'normal'].reset_index(drop=True)
    audio_samples_neg = audio_samples[audio_samples['variant'] == 'phoneme'].reset_index(drop=True)
    def append_zeros(spgr):
        return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)], 'median')
    def create_data(sf):
        sample_count = sf['spectrogram'].shape[0]
        pad_sun = sf['spectrogram'].apply(append_zeros).values
        x_data = np.vstack(pad_sun).reshape((sample_count, max_samples, sample_size))
        return x_data
    x_data_pos = create_data(audio_samples_pos)
    x_data_neg = create_data(audio_samples_neg)
    x_pos_train, x_pos_test, x_neg_train, x_neg_test = train_test_split(x_data_pos, x_data_neg, test_size=0.33)
    tr_y = np.array(x_pos_train.shape[0] * [[1, 0]])
    te_y = np.array(x_pos_test.shape[0] * [[1, 0]])
    tr_pairs = np.array([x_pos_train, x_neg_train]).reshape(x_pos_train.shape[0], 2, max_samples, sample_size)
    te_pairs = np.array([x_pos_test, x_neg_test]).reshape(x_pos_test.shape[0], 2, max_samples, sample_size)
    return tr_pairs, te_pairs, tr_y, te_y

if __name__ == '__main__':
    create_spectrogram_data()
    print(speech_pairs_data())
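The common trick in the three builders above: every spectrogram is padded along the time axis to the length of the longest clip (with `np.lib.pad` in 'median' mode), so the whole set can be stacked into one (samples, timesteps, features) array for the RNN. A toy check with made-up shapes:

import numpy as np

spgr = np.array([[1., 2.],
                 [3., 4.]])   # fake spectrogram: 2 frames x 2 frequency bins
max_samples = 4               # pretend the longest clip has 4 frames
padded = np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)], 'median')
print(padded.shape)           # (4, 2): two real frames plus two median-valued frames

One caveat worth noting: `np.array([x_pos_train, x_neg_train]).reshape(n, 2, ...)` reorders elements rather than transposing axes, so pair i comes out as two consecutive positive samples instead of (positive i, negative i); `np.stack([x_pos_train, x_neg_train], axis=1)` would give the intended pairing.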

speech_siamese.py (new file, 112 lines)

@@ -0,0 +1,112 @@
'''Train a Siamese MLP on pairs of digits from the MNIST dataset.
It follows Hadsell-et-al.'06 [1] by computing the Euclidean distance on the
output of the shared network and by optimizing the contrastive loss (see paper
for more details).
[1] "Dimensionality Reduction by Learning an Invariant Mapping"
http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
Gets to 97.2% test accuracy after 20 epochs.
2 seconds per epoch on a Titan X Maxwell GPU
'''
from __future__ import absolute_import
from __future__ import print_function
import numpy as np
import random
# from keras.datasets import mnist
from speech_data import sunflower_pairs_data
from keras.models import Model
from keras.layers import Dense, Dropout, Input, Lambda, LSTM, SimpleRNN
from keras.optimizers import RMSprop
from keras import backend as K


def euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))


def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)


def contrastive_loss(y_true, y_pred):
    '''Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    '''
    margin = 1
    return K.mean(y_true * K.square(y_pred) +
                  (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))


def create_base_network(input_dim):
    '''Base network to be shared (eq. to feature extraction).
    '''
    inp = Input(shape=input_dim)
    sr1 = SimpleRNN(128)(inp)
    # sr2 = LSTM(128)(sr1)
    # sr2 = SimpleRNN(128)(sr)
    x = Dense(128, activation='relu')(sr1)
    return Model(inp, x)


def compute_accuracy(y_true, y_pred):
    '''Compute classification accuracy with a fixed threshold on distances.
    '''
    pred = y_pred.ravel() < 0.5
    return np.mean(pred == y_true)


def accuracy(y_true, y_pred):
    '''Compute classification accuracy with a fixed threshold on distances.
    '''
    return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype)))


# the data, shuffled and split between train and test sets
tr_pairs, te_pairs, tr_y, te_y = sunflower_pairs_data()
# y_train.shape,y_test.shape
# x_train.shape,x_test.shape
# x_train = x_train.reshape(60000, 784)
# x_test = x_test.reshape(10000, 784)
# x_train = x_train.astype('float32')
# x_test = x_test.astype('float32')
# x_train /= 255
# x_test /= 255
input_dim = tr_pairs.shape[2:]
epochs = 20

# network definition
base_network = create_base_network(input_dim)

input_a = Input(shape=input_dim)
input_b = Input(shape=input_dim)

# because we re-use the same instance `base_network`,
# the weights of the network
# will be shared across the two branches
processed_a = base_network(input_a)
processed_b = base_network(input_b)

distance = Lambda(euclidean_distance,
                  output_shape=eucl_dist_output_shape)([processed_a, processed_b])

model = Model([input_a, input_b], distance)

# train
rms = RMSprop()
model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])
model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,
          batch_size=128,
          epochs=epochs,
          validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y))

# compute final accuracy on training and test sets
y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
tr_acc = compute_accuracy(tr_y, y_pred)
y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
te_acc = compute_accuracy(te_y, y_pred)

print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
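For intuition about the objective being optimized, here is a NumPy rendering of the `contrastive_loss` above with a few hand-computed values (margin = 1, as in the script): similar pairs (y = 1) are penalized by their squared distance, dissimilar pairs (y = 0) by how far inside the margin they fall.

import numpy as np

def contrastive_loss_np(y_true, d, margin=1.0):
    # same formula as the Keras version above, written with NumPy
    return np.mean(y_true * d**2 + (1 - y_true) * np.maximum(margin - d, 0)**2)

contrastive_loss_np(np.array([1.]), np.array([0.2]))  # 0.04: similar and close, low loss
contrastive_loss_np(np.array([0.]), np.array([0.2]))  # 0.64: dissimilar but close, high loss
contrastive_loss_np(np.array([0.]), np.array([1.5]))  # 0.00: dissimilar and beyond the margin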