1. fixed negative values in spectrogram

2. refactored the get-word-spectrogram code
master
Malar Kannan 2017-10-25 16:52:45 +05:30
parent f1e82a2539
commit 5824158af2
4 changed files with 52 additions and 49 deletions


@@ -78,6 +78,7 @@ def generate_spectrogram(samples, samplerate):
     sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate)
     ims = 20. * np.log10(np.abs(sshow) / 10e-6)
+    ims[ims < 0] = 0
     return ims, freq

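The one-line addition above is the first item in the commit message: any STFT magnitude below the 10e-6 (i.e. 1e-5) reference comes out of the dB conversion negative, and the new line floors those values at zero. A minimal sketch of the behaviour, with hypothetical magnitudes standing in for np.abs(sshow):

import numpy as np

mags = np.array([2e-6, 1e-5, 3e-2])   # hypothetical STFT magnitudes
ims = 20. * np.log10(mags / 10e-6)    # dB relative to the 10e-6 reference
print(ims)                            # approx. [-13.98   0.    69.54]
ims[ims < 0] = 0                      # the fix: clamp negative dB values
print(ims)                            # approx. [  0.     0.    69.54]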

@@ -15,40 +15,47 @@ def get_siamese_pairs(groupF1, groupF2):
     return (t, f)
-def create_X(sp, max_samples):
-    def append_zeros(spgr):
+def append_zeros(spgr, max_samples):
     return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
                       'median')
-    l_sample = append_zeros(sp[0]['spectrogram'])
-    r_sample = append_zeros(sp[1]['spectrogram'])
+def create_pair(l, r, max_samples):
+    l_sample = append_zeros(l, max_samples)
+    r_sample = append_zeros(r, max_samples)
     return np.asarray([l_sample, r_sample])
-def sunflower_pairs_data():
+def create_test_pair(l, r, max_samples):
+    l_sample = append_zeros(l, max_samples)
+    r_sample = append_zeros(r, max_samples)
+    return np.asarray([[l_sample, r_sample]])
+def create_X(sp, max_samples):
+    return create_pair(sp[0]['spectrogram'], sp[1]['spectrogram'], max_samples)
+def get_word_pairs_data(word, max_samples):
     audio_samples = pd.read_csv(
         './outputs/audio.csv',
         names=['word', 'voice', 'rate', 'variant', 'file'])
     audio_samples = audio_samples.loc[audio_samples['word'] ==
-                                      'sunflowers'].reset_index(drop=True)
+                                      word].reset_index(drop=True)
     audio_samples.loc[:, 'spectrogram'] = audio_samples.loc[:, 'file'].apply(
         lambda x: 'outputs/audio/' + x).apply(generate_aiff_spectrogram)
-    max_samples = audio_samples['spectrogram'].apply(
-        lambda x: x.shape[0]).max()
+    # max_samples = audio_samples['spectrogram'].apply(
+    #     lambda x: x.shape[0]).max()
     same_data, diff_data = [], []
     for (w, g) in audio_samples.groupby(audio_samples['word']):
         sample_norm = g.loc[audio_samples['variant'] == 'normal']
         sample_phon = g.loc[audio_samples['variant'] == 'phoneme']
         same, diff = get_siamese_pairs(sample_norm, sample_phon)
-        same_data.extend(same)
-        diff_data.extend(diff)
+        same_data.extend([create_X(s, max_samples) for s in same[:10]])
+        diff_data.extend([create_X(d, max_samples) for d in diff[:10]])
     Y = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))])
-    X_sample_pairs = same_data + diff_data
-    X_list = (create_X(sp, max_samples) for sp in X_sample_pairs)
-    X = np.vstack(X_list)
-    tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1)
-    return train_test_split(X, Y, test_size=0.1)
+    X = np.asarray(same_data + diff_data)
+    # tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1)
+    return (X, Y)
 def create_spectrogram_data(audio_group='audio'):

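With this refactor, the padding and pairing helpers become module-level functions of speech_data (the script in the last file imports create_test_pair and get_word_pairs_data from there), and max_samples is now supplied by the caller rather than derived from the data. Note the helper is still called append_zeros even though it pads with the median. A quick shape check under assumed dimensions, with random arrays standing in for real spectrograms:

import numpy as np

def append_zeros(spgr, max_samples):
    # pad the time axis up to max_samples using numpy's 'median' mode
    return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
                      'median')

left = np.random.rand(12, 1654)    # 12 time steps, 1654 frequency bins
right = np.random.rand(9, 1654)    # a shorter recording

pair = np.asarray([append_zeros(left, 15), append_zeros(right, 15)])
print(pair.shape)        # (2, 15, 1654) -- what create_pair() returns

test_pair = pair[np.newaxis]
print(test_pair.shape)   # (1, 2, 15, 1654) -- what create_test_pair() returns,
                         # i.e. a batch of one pair for model.predict()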

@@ -63,27 +63,27 @@ def accuracy(y_true, y_pred):
     return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype)))
-def train_siamese():
-    # the data, shuffled and split between train and test sets
-    tr_pairs, te_pairs, tr_y, te_y = speech_model_data()
-    input_dim = (tr_pairs.shape[2], tr_pairs.shape[3])
-    # network definition
+def siamese_model(input_dim):
+    # input_dim = (15, 1654)
     base_network = create_base_rnn_network(input_dim)
     input_a = Input(shape=input_dim)
     input_b = Input(shape=input_dim)
     # because we re-use the same instance `base_network`,
     # the weights of the network
     # will be shared across the two branches
     processed_a = base_network(input_a)
     processed_b = base_network(input_b)
     distance = Lambda(
         euclidean_distance,
         output_shape=eucl_dist_output_shape)([processed_a, processed_b])
     model = Model([input_a, input_b], distance)
+    return model
+def train_siamese():
+    # the data, shuffled and split between train and test sets
+    tr_pairs, te_pairs, tr_y, te_y = speech_model_data()
+    input_dim = (tr_pairs.shape[2], tr_pairs.shape[3])
+    model = siamese_model(input_dim)
     tb_cb = TensorBoard(
         log_dir='./logs/siamese_logs',
@@ -128,21 +128,6 @@ def train_siamese():
     print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
 def trained_siamese_model():
-    # input_dim = (15, 1654)
-    # base_network = create_base_rnn_network(input_dim)
-    # input_a = Input(shape=input_dim)
-    # input_b = Input(shape=input_dim)
-    # processed_a = base_network(input_a)
-    # processed_b = base_network(input_b)
-    # distance = Lambda(
-    #     euclidean_distance,
-    #     output_shape=eucl_dist_output_shape)([processed_a, processed_b])
-    #
-    # model = Model([input_a, input_b], distance)
     model = load_model('./models/siamese_speech_model-final.h5')
     return model
 if __name__ == '__main__':
     train_siamese()

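Extracting siamese_model() lets the prediction script below rebuild the architecture and load only the trained weights. The key detail is in the retained comment: applying the single base_network instance to both inputs is what ties the two branches' weights together. A self-contained sketch of that wiring; create_base_rnn_network, euclidean_distance and eucl_dist_output_shape live elsewhere in the repo, so simple stand-ins are used here:

import keras.backend as K
from keras.layers import Dense, Flatten, Input, Lambda
from keras.models import Model, Sequential

def euclidean_distance(tensors):
    # stand-in for the repo's euclidean_distance helper
    x, y = tensors
    return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True),
                            K.epsilon()))

input_dim = (15, 1654)
base = Sequential([Flatten(input_shape=input_dim),   # stand-in for
                   Dense(32, activation='relu')])    # create_base_rnn_network

input_a = Input(shape=input_dim)
input_b = Input(shape=input_dim)
# one `base` instance applied twice => the two branches share weights
distance = Lambda(euclidean_distance,
                  output_shape=lambda shapes: (shapes[0][0], 1))(
                      [base(input_a), base(input_b)])
model = Model([input_a, input_b], distance)
model.summary()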

@@ -1,7 +1,17 @@
-# from speech_siamese import trained_siamese_model
 from keras.models import load_model
+from speech_siamese import siamese_model
 from record_mic_speech import record_spectrogram
+from importlib import reload
+import speech_data
+reload(speech_data)
+from speech_data import create_test_pair, get_word_pairs_data
 import numpy as np
-model = load_model('./models/siamese_speech_model-final.h5')
-spec1 = record_spectrogram(n_sec=1.2)
-spec2 = record_spectrogram(n_sec=1.2)
+sunflower_data, sunflower_result = get_word_pairs_data('sunflowers', 15)
+sunflower_result
+model = siamese_model((15, 1654))
+model.load_weights('./models/siamese_speech_model-final.h5')
+spec1 = record_spectrogram(n_sec=1.4)
+spec2 = record_spectrogram(n_sec=1.4)
+inp = create_test_pair(spec1, spec2, 16)
+model.predict([inp[:, 0], inp[:, 1]])
+model.predict([sunflower_data[:, 0], sunflower_data[:, 1]]) < 0.5
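Since the model outputs a euclidean distance rather than a probability, the < 0.5 comparison on the last line mirrors the accuracy() metric above: a distance below 0.5 is read as "same word". A hypothetical way to score the sunflower pairs against their labels:

import numpy as np

distances = model.predict([sunflower_data[:, 0], sunflower_data[:, 1]])
predicted_same = (distances.ravel() < 0.5).astype(int)  # 1 = same, 0 = different
print(np.mean(predicted_same == sunflower_result))      # fraction of pairs correct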