added code to record and generate spectrogram, wip test model

master
Malar Kannan 2017-10-25 15:38:03 +05:30
parent a8f17ef764
commit f1e82a2539
5 changed files with 135 additions and 115 deletions

record_mic_speech.py
View File

@@ -1,36 +1,42 @@
 import pyaudio
 import numpy as np
 # from matplotlib import pyplot as plt
-from spectro_gen import plot_stft
+from spectro_gen import plot_stft, generate_spectrogram
 
 
-SAMPLE_RATE = 22050
-N_SEC = 1.5
-CHUNKSIZE = int(SAMPLE_RATE * N_SEC)  # fixed chunk size
-
-p_inp = pyaudio.PyAudio()
-stream = p_inp.open(
-    format=pyaudio.paFloat32,
-    channels=2,
-    rate=SAMPLE_RATE,
-    input=True,
-    frames_per_buffer=CHUNKSIZE)
-data = stream.read(CHUNKSIZE)
-numpydata = np.frombuffer(data, dtype=np.float32)
-multi_channel = np.abs(np.reshape(numpydata, (-1, 2))).mean(axis=1)
-one_channel = np.asarray([multi_channel, -1 * multi_channel]).T.reshape(-1)
-mean_channel_data = one_channel.tobytes()
-plot_stft(one_channel, SAMPLE_RATE)
-# plt.plot(one_channel)
-# plt.show()
-stream.stop_stream()
-stream.close()
-p_inp.terminate()
-
-p_oup = pyaudio.PyAudio()
-stream = p_oup.open(
-    format=pyaudio.paFloat32, channels=2, rate=SAMPLE_RATE, output=True)
-stream.write(mean_channel_data)
-stream.close()
-p_oup.terminate()
+def record_spectrogram(n_sec, plot=False, playback=False):
+    SAMPLE_RATE = 22050
+    N_CHANNELS = 2
+    N_SEC = n_sec
+    CHUNKSIZE = int(SAMPLE_RATE * N_SEC / N_CHANNELS)  # fixed chunk size
+    # show_record_prompt()
+    input('Press [Enter] to start recording sample... ')
+    p_inp = pyaudio.PyAudio()
+    stream = p_inp.open(
+        format=pyaudio.paFloat32,
+        channels=N_CHANNELS,
+        rate=SAMPLE_RATE,
+        input=True,
+        frames_per_buffer=CHUNKSIZE)
+    data = stream.read(CHUNKSIZE)
+    numpydata = np.frombuffer(data, dtype=np.float32)
+    multi_channel = np.abs(np.reshape(numpydata, (-1, 2))).mean(axis=1)
+    one_channel = np.asarray([multi_channel, -1 * multi_channel]).T.reshape(-1)
+    mean_channel_data = one_channel.tobytes()
+    stream.stop_stream()
+    stream.close()
+    p_inp.terminate()
+    if plot:
+        plot_stft(one_channel, SAMPLE_RATE)
+    if playback:
+        p_oup = pyaudio.PyAudio()
+        stream = p_oup.open(
+            format=pyaudio.paFloat32,
+            channels=2,
+            rate=SAMPLE_RATE,
+            output=True)
+        stream.write(mean_channel_data)
+        stream.close()
+        p_oup.terminate()
+    ims, _ = generate_spectrogram(one_channel, SAMPLE_RATE)
+    return ims
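For context, a minimal usage sketch of the new record_spectrogram API (not part of the commit). It assumes a working microphone and that generate_spectrogram returns the spectrogram first in a 2-tuple, as the `ims, _ = ...` unpacking above suggests:

# Hypothetical caller for the refactored recorder; n_sec, plot and playback
# mirror the new signature, and the return value is assumed to be a 2-D array.
from record_mic_speech import record_spectrogram

spec = record_spectrogram(n_sec=1.5, plot=False, playback=False)
print(spec.shape)  # (time frames, frequency bins), per the padding code below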

View File

@@ -128,7 +128,7 @@ def play_sunflower():
     sample_r = snd.get_info('./outputs/sunflowers-Alex-150-normal-589.aiff')[0]
     snd_data_f64 = snd.read('./outputs/sunflowers-Alex-150-normal-589.aiff')[0]
     snd_data_f32 = snd_data_f64.astype(np.float32)
-    snd_data_f32.shape
+    print(snd_data_f32.shape)
     snd_data = snd_data_f32.tobytes()
     p_oup = pyaudio.PyAudio()
     stream = p_oup.open(
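The hunk above hinges on a dtype conversion: pyaudio's paFloat32 streams consume raw 32-bit float bytes, while sound-file readers commonly return float64 arrays, so the data must be downcast before stream.write(). A standalone sketch of that step, with the AIFF read stubbed out by zeros:

import numpy as np

# stand-in for snd.read('./outputs/sunflowers-...aiff')[0], which yields float64
snd_data_f64 = np.zeros(22050, dtype=np.float64)
snd_data_f32 = snd_data_f64.astype(np.float32)  # downcast for a paFloat32 stream
snd_data = snd_data_f32.tobytes()               # raw buffer for stream.write()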

View File

@@ -15,6 +15,16 @@ def get_siamese_pairs(groupF1, groupF2):
     return (t, f)
 
 
+def create_X(sp, max_samples):
+    def append_zeros(spgr):
+        return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
+                          'median')
+
+    l_sample = append_zeros(sp[0]['spectrogram'])
+    r_sample = append_zeros(sp[1]['spectrogram'])
+    return np.asarray([l_sample, r_sample])
+
+
 def sunflower_pairs_data():
     audio_samples = pd.read_csv(
         './outputs/audio.csv',
@@ -35,19 +45,7 @@ def sunflower_pairs_data():
     Y = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))])
     X_sample_pairs = same_data + diff_data
 
-    def append_zeros(spgr):
-        sample = np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
-                            'median')
-        return np.expand_dims(sample, axis=0)
-
-    def create_X(sp):
-        # sample_count = sp[0]['file'].shape[0]
-        l_sample = append_zeros(sp[0]['spectrogram'])
-        r_sample = append_zeros(
-            sp[1]['spectrogram'])
-        return np.expand_dims(np.vstack([l_sample, r_sample]), axis=0)
-
-    X_list = (create_X(sp) for sp in X_sample_pairs)
+    X_list = (create_X(sp, max_samples) for sp in X_sample_pairs)
     X = np.vstack(X_list)
     tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1)
     return train_test_split(X, Y, test_size=0.1)
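The refactor hoists append_zeros and create_X to module level so both data builders share one padding scheme: each spectrogram is median-padded along the time axis to the longest sample, then the pair is stacked. A self-contained sketch of what the helper produces (the array shapes are illustrative, not the repo's real dimensions):

import numpy as np

max_samples = 10
left = {'spectrogram': np.random.rand(6, 4)}   # 6 time frames, 4 freq bins
right = {'spectrogram': np.random.rand(9, 4)}  # 9 time frames, 4 freq bins

def append_zeros(spgr):
    # pads only the time axis; pad values are medians ('median' mode of np.pad)
    return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)], 'median')

pair = np.asarray([append_zeros(left['spectrogram']),
                   append_zeros(right['spectrogram'])])
print(pair.shape)  # (2, 10, 4): one left-branch and one right-branch input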
@@ -69,27 +67,16 @@ def create_speech_pairs_data(audio_group='audio'):
     audio_samples = pd.read_pickle('outputs/spectrogram.pkl')
     max_samples = audio_samples['spectrogram'].apply(
         lambda x: x.shape[0]).max()
     # sample_size = audio_samples['spectrogram'][0].shape[1]
 
-    def append_zeros(spgr):
-        sample = np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
-                            'median')
-        return sample
-
-    def create_X(sp):
-        l_sample = append_zeros(sp[0]['spectrogram'])
-        r_sample = append_zeros(sp[1]['spectrogram'])
-        return np.asarray([l_sample, r_sample])
-
     print('generating siamese speech pairs')
     same_data, diff_data = [], []
     for (w, g) in audio_samples.groupby(audio_samples['word']):
         sample_norm = g.loc[audio_samples['variant'] == 'normal']
         sample_phon = g.loc[audio_samples['variant'] == 'phoneme']
         same, diff = get_siamese_pairs(sample_norm, sample_phon)
-        same_data.extend([create_X(s) for s in same[:10]])
-        diff_data.extend([create_X(d) for d in diff[:10]])
+        same_data.extend([create_X(s, max_samples) for s in same[:10]])
+        diff_data.extend([create_X(d, max_samples) for d in diff[:10]])
     print('creating all speech pairs')
     Y = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))])
     print('casting as array speech pairs')
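Downstream, speech_siamese.py feeds [tr_pairs[:, 0], tr_pairs[:, 1]] to the model and reads input_dim from tr_pairs.shape[2] and shape[3], which implies the stacked X is laid out as (n_pairs, 2, time, freq). A small sketch of that layout and the split, assuming scikit-learn's train_test_split and toy shapes:

import numpy as np
from sklearn.model_selection import train_test_split

X = np.random.rand(20, 2, 10, 4)            # 20 pairs of padded spectrograms
Y = np.hstack([np.ones(10), np.zeros(10)])  # 1 = matching pair, 0 = non-matching
tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1)
print(tr_pairs.shape)  # (18, 2, 10, 4); tr_pairs[:, 0] is the left-branch input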

speech_siamese.py
View File

@@ -2,9 +2,9 @@ from __future__ import absolute_import
 from __future__ import print_function
 
 import numpy as np
 from speech_data import speech_model_data
-from keras.models import Model
+from keras.models import Model, load_model
 from keras.layers import Input, Dense, Dropout, LSTM, Lambda
-from keras.optimizers import RMSprop, SGD
+from keras.optimizers import RMSprop
 from keras.callbacks import TensorBoard, ModelCheckpoint
 from keras import backend as K
@@ -63,6 +63,7 @@ def accuracy(y_true, y_pred):
     return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype)))
 
 
+def train_siamese():
     # the data, shuffled and split between train and test sets
     tr_pairs, te_pairs, tr_y, te_y = speech_model_data()
     input_dim = (tr_pairs.shape[2], tr_pairs.shape[3])
@@ -107,7 +108,6 @@ cp_cb = ModelCheckpoint(
         period=1)
     # train
     rms = RMSprop(lr=0.001)
-    sgd = SGD(lr=0.001)
     model.compile(loss=contrastive_loss, optimizer=rms, metrics=[accuracy])
     model.fit(
         [tr_pairs[:, 0], tr_pairs[:, 1]],
@@ -126,3 +126,23 @@ te_acc = compute_accuracy(te_y, y_pred)
     print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
     print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
+
+
+def trained_siamese_model():
+    # input_dim = (15, 1654)
+    # base_network = create_base_rnn_network(input_dim)
+    # input_a = Input(shape=input_dim)
+    # input_b = Input(shape=input_dim)
+    # processed_a = base_network(input_a)
+    # processed_b = base_network(input_b)
+    # distance = Lambda(
+    #     euclidean_distance,
+    #     output_shape=eucl_dist_output_shape)([processed_a, processed_b])
+    #
+    # model = Model([input_a, input_b], distance)
+    model = load_model('./models/siamese_speech_model-final.h5')
+    return model
+
+
+if __name__ == '__main__':
+    train_siamese()
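One caveat on trained_siamese_model: Keras' load_model must be handed any custom functions the saved model was compiled with, otherwise it raises an "unknown loss" error. A sketch of the guarded load, using the standard contrastive-loss form as a stand-in for this module's own definition:

from keras import backend as K
from keras.models import load_model

def contrastive_loss(y_true, y_pred):
    # standard Hadsell-et-al. contrastive loss; stand-in for the version
    # defined earlier in speech_siamese.py
    margin = 1
    return K.mean(y_true * K.square(y_pred) +
                  (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))

model = load_model('./models/siamese_speech_model-final.h5',
                   custom_objects={'contrastive_loss': contrastive_loss})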

test_siamese.py Normal file
View File

@@ -0,0 +1,7 @@
+# from speech_siamese import trained_siamese_model
+from keras.models import load_model
+from record_mic_speech import record_spectrogram
+
+model = load_model('./models/siamese_speech_model-final.h5')
+spec1 = record_spectrogram(n_sec=1.2)
+spec2 = record_spectrogram(n_sec=1.2)
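The test script is still WIP: it records two clips but never feeds them to the model (and its plain load_model call would also need the custom_objects workaround noted earlier). A hypothetical continuation, none of it in the commit, would pad both spectrograms to the model's expected time length and threshold the predicted distance, mirroring the y_pred < 0.5 rule in accuracy():

import numpy as np

# model.input_shape is a list for a two-input siamese net: [(None, time, freq), ...]
max_samples = model.input_shape[0][1]

def pad_time(spgr, n):
    # median-pad the time axis, matching the training-time append_zeros helper
    return np.lib.pad(spgr, [(0, n - spgr.shape[0]), (0, 0)], 'median')

a = np.expand_dims(pad_time(spec1, max_samples), axis=0)
b = np.expand_dims(pad_time(spec2, max_samples), axis=0)
dist = model.predict([a, b])[0][0]
print('same word' if dist < 0.5 else 'different word')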