added code to record and generate spectrogram, wip test model
parent a8f17ef764
commit f1e82a2539
@@ -1,36 +1,42 @@
 import pyaudio
 import numpy as np
 # from matplotlib import pyplot as plt
-from spectro_gen import plot_stft
+from spectro_gen import plot_stft, generate_spectrogram
 
-SAMPLE_RATE = 22050
-N_SEC = 1.5
-CHUNKSIZE = int(SAMPLE_RATE * N_SEC)  # fixed chunk size
-
-p_inp = pyaudio.PyAudio()
-stream = p_inp.open(
-    format=pyaudio.paFloat32,
-    channels=2,
-    rate=SAMPLE_RATE,
-    input=True,
-    frames_per_buffer=CHUNKSIZE)
-
-data = stream.read(CHUNKSIZE)
-numpydata = np.frombuffer(data, dtype=np.float32)
-multi_channel = np.abs(np.reshape(numpydata, (-1, 2))).mean(axis=1)
-one_channel = np.asarray([multi_channel, -1 * multi_channel]).T.reshape(-1)
-mean_channel_data = one_channel.tobytes()
-plot_stft(one_channel, SAMPLE_RATE)
-# plt.plot(one_channel)
-# plt.show()
-
-stream.stop_stream()
-stream.close()
-p_inp.terminate()
-
-p_oup = pyaudio.PyAudio()
-stream = p_oup.open(
-    format=pyaudio.paFloat32, channels=2, rate=SAMPLE_RATE, output=True)
-stream.write(mean_channel_data)
-stream.close()
-p_oup.terminate()
+
+def record_spectrogram(n_sec, plot=False, playback=False):
+    SAMPLE_RATE = 22050
+    N_CHANNELS = 2
+    N_SEC = n_sec
+    CHUNKSIZE = int(SAMPLE_RATE * N_SEC / N_CHANNELS)  # fixed chunk size
+    # show_record_prompt()
+    input('Press [Enter] to start recording sample... ')
+    p_inp = pyaudio.PyAudio()
+    stream = p_inp.open(
+        format=pyaudio.paFloat32,
+        channels=N_CHANNELS,
+        rate=SAMPLE_RATE,
+        input=True,
+        frames_per_buffer=CHUNKSIZE)
+    data = stream.read(CHUNKSIZE)
+    numpydata = np.frombuffer(data, dtype=np.float32)
+    multi_channel = np.abs(np.reshape(numpydata, (-1, 2))).mean(axis=1)
+    one_channel = np.asarray([multi_channel, -1 * multi_channel]).T.reshape(-1)
+    mean_channel_data = one_channel.tobytes()
+    stream.stop_stream()
+    stream.close()
+    p_inp.terminate()
+    if plot:
+        plot_stft(one_channel, SAMPLE_RATE)
+    if playback:
+        p_oup = pyaudio.PyAudio()
+        stream = p_oup.open(
+            format=pyaudio.paFloat32,
+            channels=2,
+            rate=SAMPLE_RATE,
+            output=True)
+        stream.write(mean_channel_data)
+        stream.close()
+        p_oup.terminate()
+    ims, _ = generate_spectrogram(one_channel, SAMPLE_RATE)
+    return ims
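Note: judging from the import in the new test script at the bottom of this commit, the file above is record_mic_speech.py. spectro_gen itself is not part of the diff, so generate_spectrogram can only be inferred from its call site (ims, _ = generate_spectrogram(one_channel, SAMPLE_RATE)). A minimal sketch of a compatible implementation, assuming a SciPy STFT and a time-major log-magnitude array — the window size and dB scaling are illustrative assumptions, not the repository's actual code:

    import numpy as np
    from scipy import signal

    def generate_spectrogram(samples, sample_rate):
        # hypothetical stand-in for spectro_gen.generate_spectrogram:
        # STFT of the mono signal; nperseg chosen arbitrarily here
        freqs, times, Zxx = signal.stft(samples, fs=sample_rate, nperseg=512)
        # log-magnitude, transposed to (time frames, frequency bins)
        ims = 20 * np.log10(np.abs(Zxx.T) + 1e-10)
        return ims, freqs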
@@ -128,7 +128,7 @@ def play_sunflower():
     sample_r = snd.get_info('./outputs/sunflowers-Alex-150-normal-589.aiff')[0]
     snd_data_f64 = snd.read('./outputs/sunflowers-Alex-150-normal-589.aiff')[0]
     snd_data_f32 = snd_data_f64.astype(np.float32)
-    snd_data_f32.shape
+    print(snd_data_f32.shape)
     snd_data = snd_data_f32.tobytes()
     p_oup = pyaudio.PyAudio()
     stream = p_oup.open(
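Note: the only change in play_sunflower is that the bare expression snd_data_f32.shape, which computed the array's shape and silently discarded it (a leftover from an interactive session, where the value would have been echoed), is now wrapped in print() so the shape is actually visible when the script runs.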
@@ -15,6 +15,16 @@ def get_siamese_pairs(groupF1, groupF2):
     return (t, f)
 
 
+def create_X(sp, max_samples):
+    def append_zeros(spgr):
+        return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
+                          'median')
+
+    l_sample = append_zeros(sp[0]['spectrogram'])
+    r_sample = append_zeros(sp[1]['spectrogram'])
+    return np.asarray([l_sample, r_sample])
+
+
 def sunflower_pairs_data():
     audio_samples = pd.read_csv(
         './outputs/audio.csv',
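The new shared create_X pads both spectrograms of a pair to a common frame count before stacking them (np.lib.pad is the same function as np.pad in this era of NumPy). A toy example with made-up shapes showing what append_zeros does to the time axis — despite its name, it fills with medians, not zeros:

    import numpy as np

    spgr = np.arange(12, dtype=np.float32).reshape(3, 4)  # 3 frames, 4 bins
    max_samples = 5
    padded = np.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)], 'median')
    print(padded.shape)  # (5, 4): the 2 new frames hold each column's median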
@@ -35,19 +45,7 @@ def sunflower_pairs_data():
     Y = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))])
     X_sample_pairs = same_data + diff_data
 
-    def append_zeros(spgr):
-        sample = np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
-                            'median')
-        return np.expand_dims(sample, axis=0)
-
-    def create_X(sp):
-        # sample_count = sp[0]['file'].shape[0]
-        l_sample = append_zeros(sp[0]['spectrogram'])
-        r_sample = append_zeros(
-            sp[1]['spectrogram'])
-        return np.expand_dims(np.vstack([l_sample, r_sample]), axis=0)
-
-    X_list = (create_X(sp) for sp in X_sample_pairs)
+    X_list = (create_X(sp, max_samples) for sp in X_sample_pairs)
     X = np.vstack(X_list)
     tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1)
     return train_test_split(X, Y, test_size=0.1)
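Note: untouched by this hunk, but sunflower_pairs_data still draws two independent random splits — the tr_pairs/te_pairs assignment is computed and discarded, and the return statement performs a second, different shuffle. If a single split is intended, a sketch of the fix would be:

    tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1)
    return tr_pairs, te_pairs, tr_y, te_y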
@@ -69,27 +67,16 @@ def create_speech_pairs_data(audio_group='audio'):
     audio_samples = pd.read_pickle('outputs/spectrogram.pkl')
     max_samples = audio_samples['spectrogram'].apply(
         lambda x: x.shape[0]).max()
 
     # sample_size = audio_samples['spectrogram'][0].shape[1]
 
-    def append_zeros(spgr):
-        sample = np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
-                            'median')
-        return sample
-
-    def create_X(sp):
-        l_sample = append_zeros(sp[0]['spectrogram'])
-        r_sample = append_zeros(sp[1]['spectrogram'])
-        return np.asarray([l_sample, r_sample])
-
     print('generating siamese speech pairs')
     same_data, diff_data = [], []
     for (w, g) in audio_samples.groupby(audio_samples['word']):
         sample_norm = g.loc[audio_samples['variant'] == 'normal']
         sample_phon = g.loc[audio_samples['variant'] == 'phoneme']
         same, diff = get_siamese_pairs(sample_norm, sample_phon)
-        same_data.extend([create_X(s) for s in same[:10]])
-        diff_data.extend([create_X(d) for d in diff[:10]])
+        same_data.extend([create_X(s, max_samples) for s in same[:10]])
+        diff_data.extend([create_X(d, max_samples) for d in diff[:10]])
     print('creating all speech pairs')
     Y = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))])
     print('casting as array speech pairs')
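For context on the loop above: each word group is expected to contain 'normal' and 'phoneme' recordings, and get_siamese_pairs (defined at the top of this file) turns them into same-word and different-word pairs. A toy illustration with hypothetical data of how the groupby drives the pairing (the toy masks with g['variant'] rather than the committed audio_samples['variant'], whose full-frame mask newer pandas versions refuse to align):

    import pandas as pd

    audio_samples = pd.DataFrame({
        'word':    ['sun', 'sun', 'moon', 'moon'],
        'variant': ['normal', 'phoneme', 'normal', 'phoneme'],
    })
    for w, g in audio_samples.groupby(audio_samples['word']):
        norm = g.loc[g['variant'] == 'normal']
        phon = g.loc[g['variant'] == 'phoneme']
        print(w, len(norm), len(phon))  # moon 1 1, then sun 1 1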
@@ -2,9 +2,9 @@ from __future__ import absolute_import
 from __future__ import print_function
 import numpy as np
 from speech_data import speech_model_data
-from keras.models import Model
+from keras.models import Model, load_model
 from keras.layers import Input, Dense, Dropout, LSTM, Lambda
-from keras.optimizers import RMSprop, SGD
+from keras.optimizers import RMSprop
 from keras.callbacks import TensorBoard, ModelCheckpoint
 from keras import backend as K
 
@@ -63,66 +63,86 @@ def accuracy(y_true, y_pred):
     return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype)))
 
 
-# the data, shuffled and split between train and test sets
-tr_pairs, te_pairs, tr_y, te_y = speech_model_data()
-input_dim = (tr_pairs.shape[2], tr_pairs.shape[3])
-
-# network definition
-base_network = create_base_rnn_network(input_dim)
-input_a = Input(shape=input_dim)
-input_b = Input(shape=input_dim)
-
-# because we re-use the same instance `base_network`,
-# the weights of the network
-# will be shared across the two branches
-processed_a = base_network(input_a)
-processed_b = base_network(input_b)
-
-distance = Lambda(
-    euclidean_distance,
-    output_shape=eucl_dist_output_shape)([processed_a, processed_b])
-
-model = Model([input_a, input_b], distance)
-
-tb_cb = TensorBoard(
-    log_dir='./logs/siamese_logs',
-    histogram_freq=1,
-    batch_size=32,
-    write_graph=True,
-    write_grads=True,
-    write_images=True,
-    embeddings_freq=0,
-    embeddings_layer_names=None,
-    embeddings_metadata=None)
-cp_file_fmt = './models/siamese_speech_model-{epoch:02d}-epoch-{val_acc:0.2f}\
--acc.h5'
-
-cp_cb = ModelCheckpoint(
-    cp_file_fmt,
-    monitor='val_acc',
-    verbose=0,
-    save_best_only=False,
-    save_weights_only=False,
-    mode='auto',
-    period=1)
-# train
-rms = RMSprop(lr=0.001)
-sgd = SGD(lr=0.001)
-model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])
-model.fit(
-    [tr_pairs[:, 0], tr_pairs[:, 1]],
-    tr_y,
-    batch_size=128,
-    epochs=50,
-    validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),
-    callbacks=[tb_cb, cp_cb])
-
-model.save('./models/siamese_speech_model-final.h5')
-# compute final accuracy on training and test sets
-y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
-tr_acc = compute_accuracy(tr_y, y_pred)
-y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
-te_acc = compute_accuracy(te_y, y_pred)
-
-print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
-print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
+def train_siamese():
+    # the data, shuffled and split between train and test sets
+    tr_pairs, te_pairs, tr_y, te_y = speech_model_data()
+    input_dim = (tr_pairs.shape[2], tr_pairs.shape[3])
+
+    # network definition
+    base_network = create_base_rnn_network(input_dim)
+    input_a = Input(shape=input_dim)
+    input_b = Input(shape=input_dim)
+
+    # because we re-use the same instance `base_network`,
+    # the weights of the network
+    # will be shared across the two branches
+    processed_a = base_network(input_a)
+    processed_b = base_network(input_b)
+
+    distance = Lambda(
+        euclidean_distance,
+        output_shape=eucl_dist_output_shape)([processed_a, processed_b])
+
+    model = Model([input_a, input_b], distance)
+
+    tb_cb = TensorBoard(
+        log_dir='./logs/siamese_logs',
+        histogram_freq=1,
+        batch_size=32,
+        write_graph=True,
+        write_grads=True,
+        write_images=True,
+        embeddings_freq=0,
+        embeddings_layer_names=None,
+        embeddings_metadata=None)
+    cp_file_fmt = './models/siamese_speech_model-{epoch:02d}-epoch-{val_acc:0.2f}\
+-acc.h5'
+
+    cp_cb = ModelCheckpoint(
+        cp_file_fmt,
+        monitor='val_acc',
+        verbose=0,
+        save_best_only=False,
+        save_weights_only=False,
+        mode='auto',
+        period=1)
+    # train
+    rms = RMSprop(lr=0.001)
+    model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])
+    model.fit(
+        [tr_pairs[:, 0], tr_pairs[:, 1]],
+        tr_y,
+        batch_size=128,
+        epochs=50,
+        validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),
+        callbacks=[tb_cb, cp_cb])
+
+    model.save('./models/siamese_speech_model-final.h5')
+    # compute final accuracy on training and test sets
+    y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
+    tr_acc = compute_accuracy(tr_y, y_pred)
+    y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
+    te_acc = compute_accuracy(te_y, y_pred)
+
+    print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
+    print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
+
+
+def trained_siamese_model():
+    # input_dim = (15, 1654)
+    # base_network = create_base_rnn_network(input_dim)
+    # input_a = Input(shape=input_dim)
+    # input_b = Input(shape=input_dim)
+    # processed_a = base_network(input_a)
+    # processed_b = base_network(input_b)
+    # distance = Lambda(
+    #     euclidean_distance,
+    #     output_shape=eucl_dist_output_shape)([processed_a, processed_b])
+    #
+    # model = Model([input_a, input_b], distance)
+    model = load_model('./models/siamese_speech_model-final.h5')
+    return model
+
+
+if __name__ == '__main__':
+    train_siamese()
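euclidean_distance, eucl_dist_output_shape, contrastive_loss and compute_accuracy live above this hunk and are unchanged by the commit. Judging from the accuracy function shown in the hunk header, they appear to follow the stock Keras mnist_siamese example; for reference, the standard versions (not necessarily this repository's exact code):

    def euclidean_distance(vects):
        x, y = vects
        return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True),
                                K.epsilon()))

    def eucl_dist_output_shape(shapes):
        shape1, shape2 = shapes
        return (shape1[0], 1)

    def contrastive_loss(y_true, y_pred):
        # label 1 = same pair: penalise distance; label 0 = different pair:
        # penalise only distances that fall inside the margin
        margin = 1
        return K.mean(y_true * K.square(y_pred) +
                      (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))

    def compute_accuracy(y_true, y_pred):
        # distances below 0.5 are treated as "same"
        pred = y_pred.ravel() < 0.5
        return np.mean(pred == y_true)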
@@ -0,0 +1,7 @@
+# from speech_siamese import trained_siamese_model
+from keras.models import load_model
+from record_mic_speech import record_spectrogram
+
+model = load_model('./models/siamese_speech_model-final.h5')
+spec1 = record_spectrogram(n_sec=1.2)
+spec2 = record_spectrogram(n_sec=1.2)
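Two caveats on this new test script, plus a sketch of where it is presumably headed. Keras' load_model fails on a model saved with a custom loss unless the loss is supplied via custom_objects (or compile=False), so the plain call above would raise; importing the helpers from speech_siamese is now safe precisely because this commit moved that module's top-level training code under train_siamese() and a __main__ guard. The continuation below is an assumption, not part of the commit — note also that the fresh recordings would still need padding or cropping to the training input shape:

    import numpy as np
    from speech_siamese import contrastive_loss, accuracy

    model = load_model('./models/siamese_speech_model-final.h5',
                       custom_objects={'contrastive_loss': contrastive_loss,
                                       'accuracy': accuracy})

    # the siamese model maps a pair of spectrogram batches to a distance;
    # accuracy() in speech_siamese treats distances below 0.5 as "same"
    pair_a = np.expand_dims(spec1, axis=0)
    pair_b = np.expand_dims(spec2, axis=0)
    distance = model.predict([pair_a, pair_b])[0][0]
    print('same word' if distance < 0.5 else 'different word', distance)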