added code to record and generate spectrogram, wip test model
parent a8f17ef764
commit f1e82a2539
@@ -1,36 +1,42 @@
 import pyaudio
 import numpy as np
 # from matplotlib import pyplot as plt
-from spectro_gen import plot_stft
+from spectro_gen import plot_stft, generate_spectrogram
 
-SAMPLE_RATE = 22050
-N_SEC = 1.5
-CHUNKSIZE = int(SAMPLE_RATE * N_SEC)  # fixed chunk size
 
-p_inp = pyaudio.PyAudio()
-stream = p_inp.open(
+def record_spectrogram(n_sec, plot=False, playback=False):
+    SAMPLE_RATE = 22050
+    N_CHANNELS = 2
+    N_SEC = n_sec
+    CHUNKSIZE = int(SAMPLE_RATE * N_SEC / N_CHANNELS)  # fixed chunk size
+    # show_record_prompt()
+    input('Press [Enter] to start recording sample... ')
+    p_inp = pyaudio.PyAudio()
+    stream = p_inp.open(
         format=pyaudio.paFloat32,
-        channels=2,
+        channels=N_CHANNELS,
         rate=SAMPLE_RATE,
         input=True,
         frames_per_buffer=CHUNKSIZE)
 
-data = stream.read(CHUNKSIZE)
-numpydata = np.frombuffer(data, dtype=np.float32)
-multi_channel = np.abs(np.reshape(numpydata, (-1, 2))).mean(axis=1)
-one_channel = np.asarray([multi_channel, -1 * multi_channel]).T.reshape(-1)
-mean_channel_data = one_channel.tobytes()
-plot_stft(one_channel, SAMPLE_RATE)
-# plt.plot(one_channel)
-# plt.show()
-
-stream.stop_stream()
-stream.close()
-p_inp.terminate()
-
-p_oup = pyaudio.PyAudio()
-stream = p_oup.open(
-    format=pyaudio.paFloat32, channels=2, rate=SAMPLE_RATE, output=True)
-stream.write(mean_channel_data)
-stream.close()
-p_oup.terminate()
+    data = stream.read(CHUNKSIZE)
+    numpydata = np.frombuffer(data, dtype=np.float32)
+    multi_channel = np.abs(np.reshape(numpydata, (-1, 2))).mean(axis=1)
+    one_channel = np.asarray([multi_channel, -1 * multi_channel]).T.reshape(-1)
+    mean_channel_data = one_channel.tobytes()
+    stream.stop_stream()
+    stream.close()
+    p_inp.terminate()
+    if plot:
+        plot_stft(one_channel, SAMPLE_RATE)
+    if playback:
+        p_oup = pyaudio.PyAudio()
+        stream = p_oup.open(
+            format=pyaudio.paFloat32,
+            channels=2,
+            rate=SAMPLE_RATE,
+            output=True)
+        stream.write(mean_channel_data)
+        stream.close()
+        p_oup.terminate()
+    ims, _ = generate_spectrogram(one_channel, SAMPLE_RATE)
+    return ims
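A quick way to exercise the new `record_spectrogram` helper on its own — a minimal sketch, not part of this commit, assuming PyAudio can open the default stereo input device:

```python
from record_mic_speech import record_spectrogram

# Record ~1.5 s, plot the STFT, and play the mono mix back as a sanity check.
ims = record_spectrogram(n_sec=1.5, plot=True, playback=True)
# The exact shape depends on generate_spectrogram's STFT parameters.
print(ims.shape)
```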
@@ -128,7 +128,7 @@ def play_sunflower():
     sample_r = snd.get_info('./outputs/sunflowers-Alex-150-normal-589.aiff')[0]
     snd_data_f64 = snd.read('./outputs/sunflowers-Alex-150-normal-589.aiff')[0]
     snd_data_f32 = snd_data_f64.astype(np.float32)
-    snd_data_f32.shape
+    print(snd_data_f32.shape)
     snd_data = snd_data_f32.tobytes()
     p_oup = pyaudio.PyAudio()
     stream = p_oup.open(
@@ -15,6 +15,16 @@ def get_siamese_pairs(groupF1, groupF2):
     return (t, f)
 
 
+def create_X(sp, max_samples):
+    def append_zeros(spgr):
+        return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
+                          'median')
+
+    l_sample = append_zeros(sp[0]['spectrogram'])
+    r_sample = append_zeros(sp[1]['spectrogram'])
+    return np.asarray([l_sample, r_sample])
+
+
 def sunflower_pairs_data():
     audio_samples = pd.read_csv(
         './outputs/audio.csv',
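To make the hoisted `create_X`/`append_zeros` concrete: a toy run with made-up shapes (3 and 5 time frames, 2 frequency bins), showing how median padding lets two variable-length spectrograms stack into one pair array:

```python
import numpy as np

a = np.arange(6, dtype=float).reshape(3, 2)   # 3 frames x 2 bins
b = np.arange(10, dtype=float).reshape(5, 2)  # 5 frames x 2 bins
max_samples = 5

# Pad the time axis up to max_samples, filling with the median of each column.
a_padded = np.lib.pad(a, [(0, max_samples - a.shape[0]), (0, 0)], 'median')
pair = np.asarray([a_padded, b])              # what create_X returns
print(a_padded.shape, pair.shape)             # (5, 2) (2, 5, 2)
```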
@@ -35,19 +45,7 @@ def sunflower_pairs_data():
     Y = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))])
     X_sample_pairs = same_data + diff_data
 
-    def append_zeros(spgr):
-        sample = np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
-                            'median')
-        return np.expand_dims(sample, axis=0)
-
-    def create_X(sp):
-        # sample_count = sp[0]['file'].shape[0]
-        l_sample = append_zeros(sp[0]['spectrogram'])
-        r_sample = append_zeros(
-            sp[1]['spectrogram'])
-        return np.expand_dims(np.vstack([l_sample, r_sample]), axis=0)
-
-    X_list = (create_X(sp) for sp in X_sample_pairs)
+    X_list = (create_X(sp, max_samples) for sp in X_sample_pairs)
     X = np.vstack(X_list)
-    tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1)
+    return train_test_split(X, Y, test_size=0.1)
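One thing worth flagging in this refactor: the hoisted `create_X` returns a `(2, T, F)` array, and `np.vstack` concatenates along the first axis, so `X` comes out as `(2N, T, F)` rather than the `(N, 2, T, F)` layout that `tr_pairs[:, 0]` indexing in the training code expects. A small check with toy shapes:

```python
import numpy as np

l = np.zeros((4, 3))
r = np.zeros((4, 3))
pairs = [np.asarray([l, r]) for _ in range(5)]  # each (2, 4, 3), like create_X
print(np.vstack(pairs).shape)  # (10, 4, 3) -- the pair axis is folded away
print(np.stack(pairs).shape)   # (5, 2, 4, 3) -- preserves the pair axis
```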
@@ -69,27 +67,16 @@ def create_speech_pairs_data(audio_group='audio'):
     audio_samples = pd.read_pickle('outputs/spectrogram.pkl')
     max_samples = audio_samples['spectrogram'].apply(
         lambda x: x.shape[0]).max()
 
     # sample_size = audio_samples['spectrogram'][0].shape[1]
 
-    def append_zeros(spgr):
-        sample = np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
-                            'median')
-        return sample
-
-    def create_X(sp):
-        l_sample = append_zeros(sp[0]['spectrogram'])
-        r_sample = append_zeros(sp[1]['spectrogram'])
-        return np.asarray([l_sample, r_sample])
-
     print('generating siamese speech pairs')
     same_data, diff_data = [], []
     for (w, g) in audio_samples.groupby(audio_samples['word']):
         sample_norm = g.loc[audio_samples['variant'] == 'normal']
         sample_phon = g.loc[audio_samples['variant'] == 'phoneme']
         same, diff = get_siamese_pairs(sample_norm, sample_phon)
-        same_data.extend([create_X(s) for s in same[:10]])
-        diff_data.extend([create_X(d) for d in diff[:10]])
+        same_data.extend([create_X(s, max_samples) for s in same[:10]])
+        diff_data.extend([create_X(d, max_samples) for d in diff[:10]])
     print('creating all speech pairs')
     Y = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))])
     print('casting as array speech pairs')
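The pairing loop reads more clearly with a toy frame; a sketch assuming the pickled DataFrame carries `word` and `variant` columns as the code above implies (note the loop above filters `g` with a mask built from the full `audio_samples`, where a mask built from `g` itself would be the safer idiom):

```python
import pandas as pd

df = pd.DataFrame({
    'word':    ['cat', 'cat', 'dog', 'dog'],
    'variant': ['normal', 'phoneme', 'normal', 'phoneme'],
})
for w, g in df.groupby(df['word']):
    sample_norm = g.loc[g['variant'] == 'normal']   # mask from g, not the whole frame
    sample_phon = g.loc[g['variant'] == 'phoneme']
    print(w, len(sample_norm), len(sample_phon))
```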
@@ -2,9 +2,9 @@ from __future__ import absolute_import
 from __future__ import print_function
 import numpy as np
 from speech_data import speech_model_data
-from keras.models import Model
+from keras.models import Model, load_model
 from keras.layers import Input, Dense, Dropout, LSTM, Lambda
-from keras.optimizers import RMSprop, SGD
+from keras.optimizers import RMSprop
 from keras.callbacks import TensorBoard, ModelCheckpoint
 from keras import backend as K
 
@@ -63,28 +63,29 @@ def accuracy(y_true, y_pred):
     return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype)))
 
 
-# the data, shuffled and split between train and test sets
-tr_pairs, te_pairs, tr_y, te_y = speech_model_data()
-input_dim = (tr_pairs.shape[2], tr_pairs.shape[3])
+def train_siamese():
+    # the data, shuffled and split between train and test sets
+    tr_pairs, te_pairs, tr_y, te_y = speech_model_data()
+    input_dim = (tr_pairs.shape[2], tr_pairs.shape[3])
 
-# network definition
-base_network = create_base_rnn_network(input_dim)
-input_a = Input(shape=input_dim)
-input_b = Input(shape=input_dim)
+    # network definition
+    base_network = create_base_rnn_network(input_dim)
+    input_a = Input(shape=input_dim)
+    input_b = Input(shape=input_dim)
 
-# because we re-use the same instance `base_network`,
-# the weights of the network
-# will be shared across the two branches
-processed_a = base_network(input_a)
-processed_b = base_network(input_b)
+    # because we re-use the same instance `base_network`,
+    # the weights of the network
+    # will be shared across the two branches
+    processed_a = base_network(input_a)
+    processed_b = base_network(input_b)
 
-distance = Lambda(
+    distance = Lambda(
         euclidean_distance,
         output_shape=eucl_dist_output_shape)([processed_a, processed_b])
 
-model = Model([input_a, input_b], distance)
+    model = Model([input_a, input_b], distance)
 
-tb_cb = TensorBoard(
+    tb_cb = TensorBoard(
         log_dir='./logs/siamese_logs',
         histogram_freq=1,
         batch_size=32,
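For context, the helpers this function calls (`euclidean_distance`, `eucl_dist_output_shape`, `contrastive_loss`) are not shown in this diff; they presumably follow the stock Keras siamese example, roughly:

```python
from keras import backend as K

def euclidean_distance(vects):
    # L2 distance between the two branch embeddings
    x, y = vects
    return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True),
                            K.epsilon()))

def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)

def contrastive_loss(y_true, y_pred):
    # Hadsell et al. 2006: pull matching pairs together, push
    # non-matching pairs apart by at least the margin.
    margin = 1
    return K.mean(y_true * K.square(y_pred) +
                  (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))
```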
@@ -94,10 +95,10 @@ tb_cb = TensorBoard(
         embeddings_freq=0,
         embeddings_layer_names=None,
         embeddings_metadata=None)
-cp_file_fmt = './models/siamese_speech_model-{epoch:02d}-epoch-{val_acc:0.2f}\
--acc.h5'
+    cp_file_fmt = './models/siamese_speech_model-{epoch:02d}-epoch-{val_acc:0.2f}\
+-acc.h5'
 
-cp_cb = ModelCheckpoint(
+    cp_cb = ModelCheckpoint(
         cp_file_fmt,
         monitor='val_acc',
         verbose=0,
@@ -105,11 +106,10 @@ cp_cb = ModelCheckpoint(
         save_weights_only=False,
         mode='auto',
         period=1)
-# train
-rms = RMSprop(lr=0.001)
-sgd = SGD(lr=0.001)
-model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])
-model.fit(
+    # train
+    rms = RMSprop(lr=0.001)
+    model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])
+    model.fit(
         [tr_pairs[:, 0], tr_pairs[:, 1]],
         tr_y,
         batch_size=128,
@@ -117,12 +117,32 @@ model.fit(
         validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),
         callbacks=[tb_cb, cp_cb])
 
-model.save('./models/siamese_speech_model-final.h5')
-# compute final accuracy on training and test sets
-y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
-tr_acc = compute_accuracy(tr_y, y_pred)
-y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
-te_acc = compute_accuracy(te_y, y_pred)
+    model.save('./models/siamese_speech_model-final.h5')
+    # compute final accuracy on training and test sets
+    y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
+    tr_acc = compute_accuracy(tr_y, y_pred)
+    y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
+    te_acc = compute_accuracy(te_y, y_pred)
 
-print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
-print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
+    print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
+    print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
+
+
+def trained_siamese_model():
+    # input_dim = (15, 1654)
+    # base_network = create_base_rnn_network(input_dim)
+    # input_a = Input(shape=input_dim)
+    # input_b = Input(shape=input_dim)
+    # processed_a = base_network(input_a)
+    # processed_b = base_network(input_b)
+    # distance = Lambda(
+    #     euclidean_distance,
+    #     output_shape=eucl_dist_output_shape)([processed_a, processed_b])
+    #
+    # model = Model([input_a, input_b], distance)
+    model = load_model('./models/siamese_speech_model-final.h5')
+    return model
+
+
+if __name__ == '__main__':
+    train_siamese()
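A caveat on `trained_siamese_model` (and the new test script in the next hunk): `load_model` has to deserialize the compiled loss, and a custom loss like `contrastive_loss` is not in Keras's registry, so the plain call will fail with an "unknown loss function" error. The usual fix, sketched here assuming `contrastive_loss` is importable from this module:

```python
from keras.models import load_model
from speech_siamese import contrastive_loss  # module name per the test script's comment

model = load_model(
    './models/siamese_speech_model-final.h5',
    custom_objects={'contrastive_loss': contrastive_loss})
```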
@@ -0,0 +1,7 @@
+# from speech_siamese import trained_siamese_model
+from keras.models import load_model
+from record_mic_speech import record_spectrogram
+
+model = load_model('./models/siamese_speech_model-final.h5')
+spec1 = record_spectrogram(n_sec=1.2)
+spec2 = record_spectrogram(n_sec=1.2)
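How the two recordings might actually be compared — a hypothetical continuation, not in the commit; it assumes each recording is no longer than the time length the model was trained with, and mirrors the `append_zeros` median padding from training:

```python
import numpy as np

# model.input_shape is a list for a multi-input model; the first
# branch's shape is (None, T, F), so [0][1] is the trained time length.
max_len = model.input_shape[0][1]
pad = lambda s: np.lib.pad(s, [(0, max_len - s.shape[0]), (0, 0)], 'median')
a = np.expand_dims(pad(spec1), axis=0)
b = np.expand_dims(pad(spec2), axis=0)

# Small distance => the model scores the two recordings as the same word.
dist = model.predict([a, b])
print('distance: %.4f' % float(dist))
```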