parent
f1e82a2539
commit
5824158af2
|
|
@ -78,6 +78,7 @@ def generate_spectrogram(samples, samplerate):
|
||||||
|
|
||||||
sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate)
|
sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate)
|
||||||
ims = 20. * np.log10(np.abs(sshow) / 10e-6)
|
ims = 20. * np.log10(np.abs(sshow) / 10e-6)
|
||||||
|
ims[ims<0] = 0
|
||||||
return ims, freq
|
return ims, freq
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -15,40 +15,47 @@ def get_siamese_pairs(groupF1, groupF2):
|
||||||
return (t, f)
|
return (t, f)
|
||||||
|
|
||||||
|
|
||||||
def create_X(sp, max_samples):
|
def append_zeros(spgr, max_samples):
|
||||||
def append_zeros(spgr):
|
return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
|
||||||
return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
|
'median')
|
||||||
'median')
|
|
||||||
|
|
||||||
l_sample = append_zeros(sp[0]['spectrogram'])
|
|
||||||
r_sample = append_zeros(sp[1]['spectrogram'])
|
def create_pair(l, r, max_samples):
|
||||||
|
l_sample = append_zeros(l, max_samples)
|
||||||
|
r_sample = append_zeros(r, max_samples)
|
||||||
return np.asarray([l_sample, r_sample])
|
return np.asarray([l_sample, r_sample])
|
||||||
|
|
||||||
|
|
||||||
def sunflower_pairs_data():
|
def create_test_pair(l, r, max_samples):
|
||||||
|
l_sample = append_zeros(l, max_samples)
|
||||||
|
r_sample = append_zeros(r, max_samples)
|
||||||
|
return np.asarray([[l_sample, r_sample]])
|
||||||
|
|
||||||
|
def create_X(sp, max_samples):
|
||||||
|
return create_pair(sp[0]['spectrogram'],sp[1]['spectrogram'],max_samples)
|
||||||
|
|
||||||
|
|
||||||
|
def get_word_pairs_data(word,max_samples):
|
||||||
audio_samples = pd.read_csv(
|
audio_samples = pd.read_csv(
|
||||||
'./outputs/audio.csv',
|
'./outputs/audio.csv',
|
||||||
names=['word', 'voice', 'rate', 'variant', 'file'])
|
names=['word', 'voice', 'rate', 'variant', 'file'])
|
||||||
audio_samples = audio_samples.loc[audio_samples['word'] ==
|
audio_samples = audio_samples.loc[audio_samples['word'] ==
|
||||||
'sunflowers'].reset_index(drop=True)
|
word].reset_index(drop=True)
|
||||||
audio_samples.loc[:, 'spectrogram'] = audio_samples.loc[:, 'file'].apply(
|
audio_samples.loc[:, 'spectrogram'] = audio_samples.loc[:, 'file'].apply(
|
||||||
lambda x: 'outputs/audio/' + x).apply(generate_aiff_spectrogram)
|
lambda x: 'outputs/audio/' + x).apply(generate_aiff_spectrogram)
|
||||||
max_samples = audio_samples['spectrogram'].apply(
|
# max_samples = audio_samples['spectrogram'].apply(
|
||||||
lambda x: x.shape[0]).max()
|
# lambda x: x.shape[0]).max()
|
||||||
same_data, diff_data = [], []
|
same_data, diff_data = [], []
|
||||||
for (w, g) in audio_samples.groupby(audio_samples['word']):
|
for (w, g) in audio_samples.groupby(audio_samples['word']):
|
||||||
sample_norm = g.loc[audio_samples['variant'] == 'normal']
|
sample_norm = g.loc[audio_samples['variant'] == 'normal']
|
||||||
sample_phon = g.loc[audio_samples['variant'] == 'phoneme']
|
sample_phon = g.loc[audio_samples['variant'] == 'phoneme']
|
||||||
same, diff = get_siamese_pairs(sample_norm, sample_phon)
|
same, diff = get_siamese_pairs(sample_norm, sample_phon)
|
||||||
same_data.extend(same)
|
same_data.extend([create_X(s, max_samples) for s in same[:10]])
|
||||||
diff_data.extend(diff)
|
diff_data.extend([create_X(d, max_samples) for d in diff[:10]])
|
||||||
Y = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))])
|
Y = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))])
|
||||||
X_sample_pairs = same_data + diff_data
|
X = np.asarray(same_data + diff_data)
|
||||||
|
# tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1)
|
||||||
X_list = (create_X(sp, max_samples) for sp in X_sample_pairs)
|
return (X,Y)
|
||||||
X = np.vstack(X_list)
|
|
||||||
tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1)
|
|
||||||
return train_test_split(X, Y, test_size=0.1)
|
|
||||||
|
|
||||||
|
|
||||||
def create_spectrogram_data(audio_group='audio'):
|
def create_spectrogram_data(audio_group='audio'):
|
||||||
|
|
|
||||||
|
|
@ -63,27 +63,27 @@ def accuracy(y_true, y_pred):
|
||||||
return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype)))
|
return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype)))
|
||||||
|
|
||||||
|
|
||||||
def train_siamese():
|
def siamese_model(input_dim):
|
||||||
# the data, shuffled and split between train and test sets
|
# input_dim = (15, 1654)
|
||||||
tr_pairs, te_pairs, tr_y, te_y = speech_model_data()
|
|
||||||
input_dim = (tr_pairs.shape[2], tr_pairs.shape[3])
|
|
||||||
|
|
||||||
# network definition
|
|
||||||
base_network = create_base_rnn_network(input_dim)
|
base_network = create_base_rnn_network(input_dim)
|
||||||
input_a = Input(shape=input_dim)
|
input_a = Input(shape=input_dim)
|
||||||
input_b = Input(shape=input_dim)
|
input_b = Input(shape=input_dim)
|
||||||
|
|
||||||
# because we re-use the same instance `base_network`,
|
|
||||||
# the weights of the network
|
|
||||||
# will be shared across the two branches
|
|
||||||
processed_a = base_network(input_a)
|
processed_a = base_network(input_a)
|
||||||
processed_b = base_network(input_b)
|
processed_b = base_network(input_b)
|
||||||
|
|
||||||
distance = Lambda(
|
distance = Lambda(
|
||||||
euclidean_distance,
|
euclidean_distance,
|
||||||
output_shape=eucl_dist_output_shape)([processed_a, processed_b])
|
output_shape=eucl_dist_output_shape)([processed_a, processed_b])
|
||||||
|
|
||||||
model = Model([input_a, input_b], distance)
|
model = Model([input_a, input_b], distance)
|
||||||
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
def train_siamese():
|
||||||
|
# the data, shuffled and split between train and test sets
|
||||||
|
tr_pairs, te_pairs, tr_y, te_y = speech_model_data()
|
||||||
|
input_dim = (tr_pairs.shape[2], tr_pairs.shape[3])
|
||||||
|
|
||||||
|
model = siamese_model(input_dim)
|
||||||
|
|
||||||
tb_cb = TensorBoard(
|
tb_cb = TensorBoard(
|
||||||
log_dir='./logs/siamese_logs',
|
log_dir='./logs/siamese_logs',
|
||||||
|
|
@ -128,21 +128,6 @@ def train_siamese():
|
||||||
print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
|
print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
|
||||||
|
|
||||||
|
|
||||||
def trained_siamese_model():
|
|
||||||
# input_dim = (15, 1654)
|
|
||||||
# base_network = create_base_rnn_network(input_dim)
|
|
||||||
# input_a = Input(shape=input_dim)
|
|
||||||
# input_b = Input(shape=input_dim)
|
|
||||||
# processed_a = base_network(input_a)
|
|
||||||
# processed_b = base_network(input_b)
|
|
||||||
# distance = Lambda(
|
|
||||||
# euclidean_distance,
|
|
||||||
# output_shape=eucl_dist_output_shape)([processed_a, processed_b])
|
|
||||||
#
|
|
||||||
# model = Model([input_a, input_b], distance)
|
|
||||||
model = load_model('./models/siamese_speech_model-final.h5')
|
|
||||||
return model
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
train_siamese()
|
train_siamese()
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,17 @@
|
||||||
# from speech_siamese import trained_siamese_model
|
from speech_siamese import siamese_model
|
||||||
from keras.models import load_model
|
|
||||||
from record_mic_speech import record_spectrogram
|
from record_mic_speech import record_spectrogram
|
||||||
|
from importlib import reload
|
||||||
|
import speech_data
|
||||||
|
reload(speech_data)
|
||||||
|
from speech_data import create_test_pair,get_word_pairs_data
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
model = load_model('./models/siamese_speech_model-final.h5')
|
sunflower_data,sunflower_result = get_word_pairs_data('sunflowers',15)
|
||||||
spec1 = record_spectrogram(n_sec=1.2)
|
sunflower_result
|
||||||
spec2 = record_spectrogram(n_sec=1.2)
|
model = siamese_model((15, 1654))
|
||||||
|
model.load_weights('./models/siamese_speech_model-final.h5')
|
||||||
|
spec1 = record_spectrogram(n_sec=1.4)
|
||||||
|
spec2 = record_spectrogram(n_sec=1.4)
|
||||||
|
inp = create_test_pair(spec1,spec2,16)
|
||||||
|
model.predict([inp[:, 0], inp[:, 1]])
|
||||||
|
model.predict([sunflower_data[:, 0], sunflower_data[:, 1]]) < 0.5
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue