parent 3d297f176f
commit 7fc89c0853
@@ -41,7 +41,7 @@ def create_spectrogram_tfrecords(audio_group='audio',sample_count=0,train_test_r
     http://warmspringwinds.github.io/tensorflow/tf-slim/2016/12/21/tfrecords-guide/
     http://www.machinelearninguru.com/deep_learning/tensorflow/basics/tfrecord/tfrecord.html
     '''
-    audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv',index_col=0)
+    audio_samples = pd.read_csv( './outputs/' + audio_group + '.fixed.csv',index_col=0)
     audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x)
     n_records,n_spec,n_features = 0,0,0

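Note: the TFRecord builder now reads the sanitized <group>.fixed.csv rather than the raw <group>.csv; the fix_csv hunk below is what produces that file, so the raw export is no longer overwritten in place.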
@@ -205,19 +205,19 @@ def record_generator_count(records_file):
    return record_iterator,count

def fix_csv(audio_group='audio'):
-    audio_csv_lines = open('./outputs/' + audio_group + '.csv.orig','r').readlines()
+    audio_csv_lines = open('./outputs/' + audio_group + '.csv','r').readlines()
     audio_csv_data = [i.strip().split(',') for i in audio_csv_lines]
     proper_rows = [i for i in audio_csv_data if len(i) == 7]
-    with open('./outputs/' + audio_group + '.csv','w') as fixed_csv:
+    with open('./outputs/' + audio_group + '.fixed.csv','w') as fixed_csv:
         fixed_csv_w = csv.writer(fixed_csv, quoting=csv.QUOTE_MINIMAL)
         fixed_csv_w.writerows(proper_rows)
-    audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv'
+    audio_samples = pd.read_csv( './outputs/' + audio_group + '.fixed.csv'
                                , names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file'])
     audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x)
     audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_path'], os.path.exists)
     audio_samples = audio_samples[audio_samples['file_exists'] == True]
     audio_samples = audio_samples.drop(['file_path','file_exists'],axis=1).reset_index(drop=True)
-    audio_samples.to_csv('./outputs/' + audio_group + '.csv')
+    audio_samples.to_csv('./outputs/' + audio_group + '.fixed.csv')

def convert_old_audio():
    audio_samples = pd.read_csv( './outputs/audio.csv.old'
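Taken together, fix_csv is now non-destructive: it reads the raw CSV directly (no more pre-renamed .csv.orig), keeps only rows with the expected 7 fields and an existing audio file, and writes the result to <group>.fixed.csv instead of clobbering its input. A minimal standalone sketch of that flow, with the repo's apply_by_multiprocessing helper swapped for a plain pandas apply for brevity:

    import csv
    import os
    import pandas as pd

    def fix_csv_sketch(audio_group='audio'):
        # Keep only lines that split into exactly the 7 expected fields.
        with open('./outputs/' + audio_group + '.csv') as raw:
            rows = [line.strip().split(',') for line in raw]
        proper_rows = [r for r in rows if len(r) == 7]
        with open('./outputs/' + audio_group + '.fixed.csv', 'w', newline='') as out:
            csv.writer(out, quoting=csv.QUOTE_MINIMAL).writerows(proper_rows)
        cols = ['word', 'phonemes', 'voice', 'language', 'rate', 'variant', 'file']
        samples = pd.read_csv('./outputs/' + audio_group + '.fixed.csv', names=cols)
        # Drop rows whose audio file is missing on disk (serial apply here,
        # where the original parallelizes with apply_by_multiprocessing).
        exists = samples['file'].apply(
            lambda f: os.path.exists('outputs/' + audio_group + '/' + f))
        samples[exists].reset_index(drop=True).to_csv(
            './outputs/' + audio_group + '.fixed.csv')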
@@ -243,7 +243,8 @@ if __name__ == '__main__':
     # create_spectrogram_tfrecords('audio',sample_count=100)
     # create_spectrogram_tfrecords('story_all',sample_count=25)
     # fix_csv('story_words_test')
-    create_spectrogram_tfrecords('story_words_test',sample_count=100,train_test_ratio=0.0)
+    fix_csv('story_phrases')
+    create_spectrogram_tfrecords('story_phrases',sample_count=100,train_test_ratio=0.3)
     # create_spectrogram_tfrecords('audio',sample_count=50)
     # read_siamese_tfrecords_generator('audio')
     # padd_zeros_siamese_tfrecords('audio')
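The driver now targets the story_phrases group and, unlike the story_words_test run it replaces (train_test_ratio=0.0, presumably meaning no held-out split), reserves 30% of the pairs for the test set. Running fix_csv first guarantees that the .fixed.csv which create_spectrogram_tfrecords now expects actually exists.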
@@ -1,36 +1,16 @@
 from __future__ import absolute_import
 from __future__ import print_function
 import numpy as np
-# from speech_data import speech_model_data
 from speech_data import read_siamese_tfrecords_generator
 from keras.models import Model,load_model,model_from_yaml
 from keras.layers import Input, Dense, Dropout, LSTM, Lambda, Concatenate
 from keras.losses import categorical_crossentropy
-# from keras.losses import binary_crossentropy
 from keras.utils import to_categorical
-# from keras.utils.np_utils import to_categorical
 from keras.optimizers import RMSprop
 from keras.callbacks import TensorBoard, ModelCheckpoint
 from keras import backend as K
 from speech_tools import create_dir,step_count

-# def euclidean_distance(vects):
-#     x, y = vects
-#     return K.sqrt(
-#         K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))
-#
-#
-# def eucl_dist_output_shape(shapes):
-#     shape1, shape2 = shapes
-#     return (shape1[0], 1)
-#
-#
-# def contrastive_loss(y_true, y_pred):
-#     '''Contrastive loss from Hadsell-et-al.'06
-#     http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
-#     '''
-#     return K.mean(y_true * K.square(y_pred) +
-#                   (1 - y_true) * K.square(K.maximum(1 - y_pred, 0)))

 def create_base_rnn_network(input_dim):
     '''Base network to be shared (eq. to feature extraction).
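These deletions retire the distance-based siamese recipe (Euclidean distance between the twin embeddings, trained with Hadsell et al.'s contrastive loss, mean(y*d^2 + (1-y)*max(1-d, 0)^2)) along with the dead imports that went with it. Judging by the Concatenate import and the Dense(2, softmax) head in dense_classifier below, pair matching is instead framed as two-class classification under categorical_crossentropy.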
@@ -68,7 +48,6 @@ def dense_classifier(processed):
    return Dense(2, activation='softmax')(d3)

def siamese_model(input_dim):
-    # input_dim = (15, 1654)
     base_network = create_base_rnn_network(input_dim)
     input_a = Input(shape=input_dim)
     input_b = Input(shape=input_dim)
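Only the top of siamese_model is visible in this hunk; the essential property is that a single base_network instance is applied to both inputs, so the two branches share weights. A sketch of that wiring, under the assumption (from the imports and dense_classifier) that the branch outputs are concatenated and classified; layer sizes here are illustrative:

    from keras.models import Model
    from keras.layers import Input, Dense, Dropout, LSTM, Concatenate

    def create_base_rnn_network_sketch(input_dim):
        # Stand-in for the shared feature extractor (an LSTM, per the imports).
        inp = Input(shape=input_dim)
        feat = Dropout(0.1)(LSTM(128)(inp))  # 128 units is a guess
        return Model(inp, feat)

    def siamese_model_sketch(input_dim):
        base_network = create_base_rnn_network_sketch(input_dim)
        input_a = Input(shape=input_dim)
        input_b = Input(shape=input_dim)
        # Calling the same Model object on both tensors shares its weights.
        processed_a = base_network(input_a)
        processed_b = base_network(input_b)
        merged = Concatenate()([processed_a, processed_b])
        output = Dense(2, activation='softmax')(merged)  # matches dense_classifier's head
        return Model([input_a, input_b], output)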
@@ -94,8 +73,6 @@ def load_model_arch(mod_file):
    return mod

def train_siamese(audio_group = 'audio'):
-    # the data, shuffled and split between train and test sets
-    # tr_pairs, te_pairs, tr_y_e, te_y_e = speech_model_data()
     batch_size = 256
     model_dir = './models/'+audio_group
     create_dir(model_dir)
@@ -103,8 +80,6 @@ def train_siamese(audio_group = 'audio'):
     create_dir(log_dir)
     tr_gen_fn,te_pairs,te_y,n_step,n_features,n_records = read_siamese_tfrecords_generator(audio_group,batch_size=batch_size,test_size=batch_size)
     tr_gen = tr_gen_fn()
-    # tr_y = to_categorical(tr_y_e, num_classes=2)
-    # te_y = to_categorical(te_y_e, num_classes=2)
     input_dim = (n_step, n_features)

     model = siamese_model(input_dim)
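For the fit_generator call below to work, tr_gen has to yield ([left_batch, right_batch], one_hot_labels) tuples indefinitely, with batches shaped (batch_size, n_step, n_features). A toy generator honoring that contract, with shapes taken from the removed input_dim = (15, 1654) comment and random data that is purely illustrative:

    import numpy as np

    def toy_pair_generator(n_step=15, n_features=1654, batch_size=256):
        # Endless ([a, b], y) batches: the shape contract fit_generator expects.
        while True:
            a = np.random.rand(batch_size, n_step, n_features).astype('float32')
            b = np.random.rand(batch_size, n_step, n_features).astype('float32')
            y = np.eye(2)[np.random.randint(0, 2, size=batch_size)]  # one-hot same/different
            yield [a, b], y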
@@ -131,29 +106,17 @@ def train_siamese(audio_group = 'audio'):
                           mode='auto',
                           period=1)
     # train
-    rms = RMSprop()#lr=0.001
+    rms = RMSprop()
     model.compile(loss=categorical_crossentropy, optimizer=rms, metrics=[accuracy])
     write_model_arch(model,model_dir+'/siamese_speech_model_arch.yaml')
-    # model.fit(
-    #     [tr_pairs[:, 0], tr_pairs[:, 1]],
-    #     tr_y,
-    #     batch_size=128,
-    #     epochs=100,
-    #     validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),
-    #     callbacks=[tb_cb, cp_cb])
     epoch_n_steps = step_count(n_records,batch_size)
     model.fit_generator(tr_gen
                        , epochs=1000
                        , steps_per_epoch=epoch_n_steps
                        , validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y)
-                       # ,use_multiprocessing=True, workers=1
                        , max_queue_size=32
                        , callbacks=[tb_cb, cp_cb])
     model.save(model_dir+'/siamese_speech_model-final.h5')
-    # compute final accuracy on training and test sets
-    # y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
-    # tr_acc = compute_accuracy(tr_y, y_pred)
-    # print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))

     y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
     te_acc = compute_accuracy(te_y, y_pred)
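steps_per_epoch comes from step_count(n_records, batch_size), which presumably rounds up so the final partial batch still counts toward an epoch; a one-liner with that behavior, assuming the helper in speech_tools works this way:

    def step_count(n_records, batch_size):
        # Ceiling division: batches needed to cover every record once.
        return (n_records + batch_size - 1) // batch_size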
@@ -162,5 +125,4 @@ def train_siamese(audio_group = 'audio'):


if __name__ == '__main__':
-    train_siamese('story_words_test')
-    # train_siamese('audio')
+    train_siamese('story_phrases')