def fix_csv(collection_name = 'test'):
    """Re-save a header-less segment index CSV with explicit column names.

    Reads ``./outputs/segments/<collection_name>/index.csv`` treating every
    row as data, labels the six columns, and writes the result next to it as
    ``index.fixed.csv`` (the DataFrame index is written as the first column).

    Args:
        collection_name: name of the sub-directory under ``./outputs/segments``.
    """
    column_names = ['phrase', 'filename',
                    'start_phoneme', 'end_phoneme', 'start_time', 'end_time']
    source_csv = './outputs/segments/'+collection_name+'/index.csv'
    fixed_csv = './outputs/segments/'+collection_name+'/index.fixed.csv'
    labelled = pd.read_csv(source_csv, names=column_names)
    labelled.to_csv(fixed_csv)
def plot_segments(collection_name = 'story_test_segments'):
    """Compute per-frame phoneme-boundary labels for every phrase in a collection.

    Reads ``./outputs/<collection_name>.fixed.csv``, groups the rows by
    ``phrase`` and, for each phrase, builds the spectrogram of the first
    listed audio file and a 0/1 vector marking the frames where a phoneme
    ends. Nothing is drawn or returned yet: the phrase is printed and the
    label vector is discarded.

    Args:
        collection_name: collection to process. It is now honoured; it used
            to be overwritten by a hard-coded ``'story_test_segments'``.
    """
    seg_data = pd.read_csv('./outputs/'+collection_name+'.fixed.csv',index_col=0)
    for (phrase, group) in seg_data.groupby('phrase'):
        file_path = './outputs/'+collection_name+'/'+group.iloc[0]['filename']
        # Reuse the shared helper so window size / max frequency stay in sync
        # with the TFRecord writer (same values as the previous literals).
        sg_db, phrase_spec = generate_spec(file_path)
        print(phrase)
        n_frames = sg_db.shape[0]
        # 'end_time' is divided by 1000 before being used as a spectrogram time.
        end_times = [end_ms / 1000 for end_ms in group['end_time']]
        # The first phoneme's end is skipped, as in the original code.
        frame_bounds = [spec_frame(phrase_spec, t) for t in end_times[1:]]
        # Keep only in-range frames; the explicit integer dtype keeps an empty
        # selection usable as an index (a bare np.asarray([]) is float64).
        b_frames = np.asarray([f for f in frame_bounds if 0 < f < n_frames],
                              dtype=np.int64)
        result = np.zeros(n_frames, dtype=np.int32)
        result[b_frames] = 1


def generate_spec(aiff_file):
    """Load an audio file and return its spectrogram in decibels.

    Args:
        aiff_file: path handed to ``pm_snd``.

    Returns:
        ``(sg_db, phrase_spec)``: ``10 * log10`` of the spectrogram values and
        the spectrogram object itself (needed for time/frame conversion).
    """
    phrase_sample = pm_snd(aiff_file)
    phrase_spec = phrase_sample.to_spectrogram(
        window_length=SPEC_WINDOW_SIZE, maximum_frequency=SPEC_MAX_FREQUENCY)
    sg_db = 10 * np.log10(phrase_spec.values)
    return sg_db, phrase_spec


def spec_frame(spec, b):
    """Return the spectrogram frame number nearest to time ``b``.

    Args:
        spec: spectrogram object exposing ``get_frame_number_from_time``.
        b: time position; callers pass phoneme end times in seconds.

    Returns:
        The (possibly fractional) frame number rounded to an ``int``.
    """
    # The previous body called frame_number_to_time(b), i.e. the inverse
    # conversion, although every caller passes a time and uses the result as a
    # frame index.
    # NOTE(review): Praat frame numbers are presumably 1-based; callers index
    # NumPy arrays with this value directly — confirm whether a -1 is wanted.
    return int(round(spec.get_frame_number_from_time(b)))


def _float_feature(value):
    """Wrap an iterable of floats in a ``tf.train.Feature``."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))


def _int64_feature(value):
    """Wrap an iterable of ints in a ``tf.train.Feature``."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
def _bytes_feature(value):
    """Wrap an iterable of byte strings in a ``tf.train.Feature``."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))


def create_segments_tfrecords(collection_name='story_test_segments',sample_count=0,train_test_ratio=0.1):
    """Write train/test TFRecords of spectrograms with phoneme-boundary labels.

    Reads ``./outputs/segments/<collection_name>/index.fixed.csv``, groups the
    rows by phrase, optionally samples ``sample_count`` phrases, splits them
    into train and test sets and writes ``train.tfrecords`` /
    ``test.tfrecords`` into the collection directory. Finally pickles
    ``(n_spec, n_features, n_records)`` to ``constants.pkl``.

    Args:
        collection_name: sub-directory of ``./outputs/segments`` to process.
        sample_count: number of phrases to sample; ``0`` (or less) uses all.
        train_test_ratio: passed to ``train_test_split`` as ``test_size``.
    """
    audio_samples = pd.read_csv(
        './outputs/segments/' + collection_name + '/index.fixed.csv', index_col=0)
    audio_samples['file_path'] = audio_samples.loc[:, 'filename'].apply(
        lambda x: 'outputs/segments/' + collection_name + '/samples/' + x)
    # Running totals shared by both write_samples() calls.
    n_records, n_spec, n_features = 0, 0, 0

    def write_samples(wg, sample_name):
        """Serialise every (phrase, rows) group in ``wg`` into one TFRecord file."""
        nonlocal n_records, n_spec, n_features
        record_file = './outputs/segments/{}/{}.tfrecords'.format(collection_name, sample_name)
        phrase_groups = tqdm(wg, desc='Computing segmentation')
        writer = tf.python_io.TFRecordWriter(record_file)
        try:
            for (ph, g) in phrase_groups:
                fname = g.iloc[0]['filename']
                sg_db, phrase_spec = generate_spec(g.iloc[0]['file_path'])
                # NOTE(review): axis 0 is used as the frame axis below —
                # confirm against the spectrogram layout of the library version.
                spec_n, spec_w = sg_db.shape
                spec = sg_db.reshape(-1)
                # 'end_time' is divided by 1000 before conversion to a frame.
                ph_bounds = [end_ms / 1000 for end_ms in g['end_time']]
                f_bounds = [spec_frame(phrase_spec, b) for b in ph_bounds]
                valid_bounds = [i for i in f_bounds if 0 < i < spec_n]
                # Explicit integer dtype: np.asarray([]) is float64 and raises
                # IndexError when used as an index if no boundary is in range.
                b_frames = np.asarray(valid_bounds, dtype=np.int64)
                result = np.zeros(spec_n, dtype=np.int32)
                result[b_frames] = 1
                n_spec = max(n_spec, spec_n)
                n_features = spec_w
                n_records += 1
                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        'phrase': _bytes_feature([ph.encode('utf-8')]),
                        'file': _bytes_feature([fname.encode('utf-8')]),
                        'spec': _float_feature(spec),
                        'spec_n1': _int64_feature([spec_n]),
                        'spec_w1': _int64_feature([spec_w]),
                        'output': _int64_feature(result)
                    }
                ))
                writer.write(example.SerializeToString())
        finally:
            # Release the progress bar and flush the record file even when a
            # sample fails part-way through.
            phrase_groups.close()
            writer.close()

    word_groups = [i for i in audio_samples.groupby('phrase')]
    wg_sampled = reservoir_sample(word_groups,sample_count) if sample_count > 0 else word_groups
    tr_audio_samples,te_audio_samples = train_test_split(wg_sampled,test_size=train_test_ratio)
    write_samples(tr_audio_samples,'train')
    write_samples(te_audio_samples,'test')
    const_file = './outputs/segments/'+collection_name+'/constants.pkl'
    # Context manager so the pickle is flushed and the handle closed.
    with open(const_file,'wb') as const_f:
        pickle.dump((n_spec,n_features,n_records),const_f)


if __name__ == '__main__':
    # plot_random_phrases()
    # fix_csv('story_test_segments')
    # plot_segments('story_test_segments')
    # fix_csv('story_test')
    create_segments_tfrecords('story_test')
def reservoir_sample(iterable, k):
    """Return up to ``k`` items chosen at random from ``iterable`` in one pass.

    The first ``k`` items fill the reservoir; each later item replaces a
    random slot with gradually decreasing probability, so the input never has
    to be held in memory or have a known length.

    Args:
        iterable: any iterable, including one-shot iterators/generators.
        k: reservoir size; must be positive.

    Returns:
        A list of ``min(k, number of items)`` elements. If the input has fewer
        than ``k`` items, all of them are returned in random order.

    Raises:
        ValueError: if ``k`` is not positive.
    """
    # Imported here because the hunk that moves this function into this module
    # adds no top-level imports for these names; keeps the function
    # self-contained either way.
    import itertools
    import random

    it = iter(iterable)
    if not (k > 0):
        raise ValueError("sample size must be positive")

    sample = list(itertools.islice(it, k))  # fill the reservoir
    random.shuffle(sample)  # if number of items less than *k* then
                            #   return all items in random order.
    for i, item in enumerate(it, start=k+1):
        j = random.randrange(i)  # random [0..i)
        if j < k:
            sample[j] = item  # replace item with gradually decreasing probability
    return sample