diff --git a/segment_data.py b/segment_data.py
index d115d10..b6472a1 100644
--- a/segment_data.py
+++ b/segment_data.py
@@ -176,6 +176,70 @@ def create_segments_tfrecords(collection_name='story_test_segments',sample_count
     const_file = './outputs/segments/'+collection_name+'/constants.pkl'
     pickle.dump((n_spec,n_features,n_records),open(const_file,'wb'))
 
+# Count the records in a TFRecord file; returns a fresh iterator plus the count.
+def record_generator_count(records_file):
+    record_iterator = tf.python_io.tf_record_iterator(path=records_file)
+    count,spec_n = 0,0
+    for i in record_iterator:
+        count+=1
+    record_iterator = tf.python_io.tf_record_iterator(path=records_file)
+    return record_iterator,count
+
+# Returns a train batch generator, one-shot test arrays and a constants copier.
+def read_segments_tfrecords_generator(collection_name='audio',batch_size=32,test_size=0):
+    records_file = './outputs/segments/'+collection_name+'/train.tfrecords'
+    const_file = './outputs/segments/'+collection_name+'/constants.pkl'
+    (n_spec,n_features,n_records) = pickle.load(open(const_file,'rb'))
+
+    def copy_read_consts(dest_dir):
+        shutil.copy2(const_file,dest_dir+'/constants.pkl')
+        return (n_spec,n_features,n_records)
+    # @threadsafe_iter
+    def record_generator():
+        print('reading tfrecords({}-train)...'.format(collection_name))
+        input_data = []
+        output_data = []
+        while True: # loop forever; a Keras fit generator must never exhaust
+            record_iterator,records_count = record_generator_count(records_file)
+            for (i,string_record) in enumerate(record_iterator):
+                example = tf.train.Example()
+                example.ParseFromString(string_record)
+                spec_n = example.features.feature['spec_n'].int64_list.value[0]
+                spec_w = example.features.feature['spec_w'].int64_list.value[0]
+                spec = np.array(example.features.feature['spec'].float_list.value).reshape(spec_n,spec_w)
+                p_spec = padd_zeros(spec,n_spec)
+                input_data.append(p_spec)
+                output = example.features.feature['output'].int64_list.value
+                output_data.append(np.asarray(output))
+                if len(input_data) == batch_size or i == n_records-1:
+                    input_arr = np.asarray(input_data)
+                    output_arr = np.asarray(output_data)
+                    yield (input_arr,output_arr)
+                    input_data = []
+                    output_data = []
+
+    # Read test in one-shot
+    # collection_name = 'story_test'
+    print('reading tfrecords({}-test)...'.format(collection_name))
+    te_records_file = './outputs/segments/'+collection_name+'/test.tfrecords'
+    te_re_iterator,te_n_records = record_generator_count(te_records_file)
+    test_size = min([test_size,te_n_records]) if test_size > 0 else te_n_records
+    input_data = np.zeros((test_size,n_spec,n_features))
+    output_data = np.zeros((test_size,2))
+    random_samples = enumerate(reservoir_sample(te_re_iterator,test_size))
+    for (i,string_record) in tqdm(random_samples,total=test_size):
+        example = tf.train.Example()
+        example.ParseFromString(string_record)
+        spec_n = example.features.feature['spec_n'].int64_list.value[0]
+        spec_w = example.features.feature['spec_w'].int64_list.value[0]
+        spec = np.array(example.features.feature['spec'].float_list.value).reshape(spec_n,spec_w)
+        p_spec = padd_zeros(spec,n_spec)
+        input_data[i] = p_spec
+        output = example.features.feature['output'].int64_list.value
+        output_data[i] = np.asarray(output)
+
+    return record_generator,input_data,output_data,copy_read_consts
+
 if __name__ == '__main__':
     # plot_random_phrases()
     # fix_csv('story_test_segments')
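Note: the test-set sampling above calls reservoir_sample, whose body is not part of this patch; the itertools and random imports added to speech_tools.py below suggest that is where it lives. For reference only, a minimal single-pass reservoir sampler (Algorithm R) over an iterator of unknown length could look like the sketch below; the name and signature are assumed from the call site, not taken from the repo.

import itertools
import random

def reservoir_sample(iterator, k):
    # Fill the reservoir with the first k items...
    sample = list(itertools.islice(iterator, k))
    # ...then give the n-th item a k/n chance of replacing a survivor,
    # so every item is kept with equal probability after one pass.
    for n, item in enumerate(iterator, start=k + 1):
        j = random.randrange(n)  # uniform index in [0, n)
        if j < k:
            sample[j] = item
    return sample
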
diff --git a/segment_model.py b/segment_model.py
index 1bb6a5d..c548ede 100644
--- a/segment_model.py
+++ b/segment_model.py
@@ -12,7 +12,13 @@ from keras.callbacks import TensorBoard, ModelCheckpoint
 from keras import backend as K
 from keras.utils import plot_model
 from speech_tools import create_dir,step_count
-from segment_data import segment_data_gen
+from segment_data import read_segments_tfrecords_generator
+
+import importlib
+import segment_data
+import speech_tools
+importlib.reload(segment_data)
+importlib.reload(speech_tools)
 
 # TODO implement ctc losses
 
@@ -48,22 +54,16 @@ def segment_model(input_dim):
     return Model(inp, oup)
 
 def simple_segment_model(input_dim):
-    input_dim = (100,100,1)
+    # input_dim = (100,100)
     inp = Input(shape=input_dim)
-    cnv1 = Conv2D(filters=32, kernel_size=(5,9))(inp)
-    cnv2 = Conv2D(filters=1, kernel_size=(5,9))(cnv1)
-    dr_cnv2 = Dropout(rate=0.95)(cnv2)
-    # dr_cnv2
-    cn_rnn_dim = (dr_cnv2.shape[1].value,dr_cnv2.shape[2].value)
-    r_dr_cnv2 = Reshape(target_shape=cn_rnn_dim)(dr_cnv2)
-    b_gr1 = Bidirectional(GRU(512, return_sequences=True),merge_mode='sum')(r_dr_cnv2)
+    b_gr1 = Bidirectional(GRU(256, return_sequences=True),merge_mode='sum')(inp)
     # b_gr1
-    b_gr2 = Bidirectional(GRU(512, return_sequences=True),merge_mode='sum')(b_gr1)
-    b_gr3 = Bidirectional(GRU(512, return_sequences=True),merge_mode='sum')(b_gr2)
+    b_gr2 = Bidirectional(GRU(64, return_sequences=True),merge_mode='sum')(b_gr1)
+    b_gr3 = Bidirectional(GRU(1, return_sequences=True),merge_mode='sum')(b_gr2)
     # b_gr3
-    oup = Dense(2, activation='softmax')(b_gr3)
+    # oup = Dense(2, activation='softmax')(b_gr3)
     # oup
-    return Model(inp, oup)
+    return Model(inp, b_gr3)
 
 def write_model_arch(mod,mod_file):
     model_f = open(mod_file,'w')
@@ -77,15 +77,16 @@ def load_model_arch(mod_file):
     return mod
 
 def train_segment(collection_name = 'test'):
+    collection_name = 'story_test'
     batch_size = 128
     model_dir = './models/segment/'+collection_name
    create_dir(model_dir)
     log_dir = './logs/segment/'+collection_name
     create_dir(log_dir)
-    tr_gen_fn = segment_data_gen()
+    tr_gen_fn,inp,oup,copy_read_consts = read_segments_tfrecords_generator(collection_name,batch_size,2*batch_size)
     tr_gen = tr_gen_fn()
+    n_step,n_features,n_records = copy_read_consts(model_dir)
     input_dim = (n_step, n_features)
-
     model = simple_segment_model(input_dim)
     plot_model(model,show_shapes=True, to_file=model_dir+'/model.png')
     # loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred, labels, input_length, label_length])
diff --git a/speech_data.py b/speech_data.py
index 7fa7735..db94d04 100644
--- a/speech_data.py
+++ b/speech_data.py
@@ -9,7 +9,6 @@ from speech_spectrum import generate_aiff_spectrogram
 from speech_pitch import pitch_array
 from speech_pitch import compute_mfcc
 from sklearn.model_selection import train_test_split
-import itertools
 import os,shutil
 import random
 import csv
@@ -168,7 +167,7 @@ def read_siamese_tfrecords_generator(audio_group='audio',batch_size=32,test_size
     # Read test in one-shot
     print('reading tfrecords({}-test)...'.format(audio_group))
     te_records_file = os.path.join('./outputs',audio_group+'.test.tfrecords')
-    te_re_iterator,te_n_records = record_generator_count(records_file)
+    te_re_iterator,te_n_records = record_generator_count(te_records_file)
     test_size = min([test_size,te_n_records]) if test_size > 0 else te_n_records
     input_data = np.zeros((test_size,2,n_spec,n_features))
     output_data = np.zeros((test_size,2))
diff --git a/speech_tools.py b/speech_tools.py
index eedf15d..aa22525 100644
--- a/speech_tools.py
+++ b/speech_tools.py
@@ -2,6 +2,8 @@ import os
 import math
 import string
 import threading
+import itertools
+import random
 import multiprocessing
 import pandas as pd
 import numpy as np
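Note: a quick smoke test of the new reader and the slimmed-down model, assuming the story_test TFRecords and constants.pkl already exist under ./outputs/segments/; the batch sizes and target directory here are illustrative, not part of the patch.

from speech_tools import create_dir
from segment_data import read_segments_tfrecords_generator
from segment_model import simple_segment_model

tr_gen_fn, inp, oup, copy_read_consts = read_segments_tfrecords_generator(
    'story_test', batch_size=32, test_size=64)

x_batch, y_batch = next(tr_gen_fn())   # pull one training batch
print(x_batch.shape, y_batch.shape)    # expect (32, n_spec, n_features) and (32, ...)
print(inp.shape, oup.shape)            # one-shot test arrays for validation

model_dir = './models/segment/story_test'
create_dir(model_dir)                  # copy_read_consts copies constants.pkl here
n_step, n_features, n_records = copy_read_consts(model_dir)
model = simple_segment_model((n_step, n_features))
model.summary()                        # final GRU(1) now emits one value per timestep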