import random
import math
import pickle
import shutil
from functools import reduce

import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
from sklearn.model_selection import train_test_split

from speech_pitch import *
from speech_tools import reservoir_sample, padd_zeros
# import importlib
# import speech_tools
# importlib.reload(speech_tools)
# %matplotlib inline

SPEC_MAX_FREQUENCY = 8000
SPEC_WINDOW_SIZE = 0.03


def fix_csv(collection_name='test'):
    """Re-save the raw segment index with named columns."""
    seg_data = pd.read_csv('./outputs/segments/' + collection_name + '/index.csv',
                           names=['phrase', 'filename', 'start_phoneme',
                                  'end_phoneme', 'start_time', 'end_time'])
    seg_data.to_csv('./outputs/segments/' + collection_name + '/index.fixed.csv')


def pick_random_phrases(collection_name='test'):
    """Sample 10 random phrases from the fixed index and save their names."""
    seg_data = pd.read_csv('./outputs/' + collection_name + '.fixed.csv', index_col=0)
    phrase_groups = random.sample([i for i in seg_data.groupby(['phrase'])], 10)
    result = [ph for ph, g in phrase_groups]
    pd.DataFrame(result, columns=['phrase']).to_csv('./outputs/' + collection_name + '.random.csv')

# pick_random_phrases()


def plot_random_phrases(collection_name='test'):
    """Plot pitch tracks of the sampled phrases next to self-recorded versions."""
    rand_words = pd.read_csv('./outputs/' + collection_name + '.random.csv', index_col=0)
    rand_w_list = rand_words['phrase'].tolist()
    seg_data = pd.read_csv('./outputs/' + collection_name + '.fixed.csv', index_col=0)
    result = (seg_data['phrase'] == rand_w_list[0])
    for i in rand_w_list[1:]:
        result |= (seg_data['phrase'] == i)
    phrase_groups = [i for i in seg_data[result].groupby(['phrase'])]
    self_files = ['a_wrong_turn-low1.aiff', 'great_pin-low1.aiff',
                  'he_set_off_at_once_to_find_the_beast-low1.aiff',
                  'hound-low1.aiff', 'noises-low1.aiff', 'po_burped-low1.aiff',
                  'she_loves_the_roses-low1.aiff', 'the_busy_spider-low1.aiff',
                  'the_rain_helped-low1.aiff', 'to_go_to_the_doctor-low1.aiff']
    co_files = map(lambda x: './inputs/self/' + x, self_files)
    for ((ph, g), s_f) in zip(phrase_groups, co_files):
        file_path = './outputs/test/' + g.iloc[0]['filename']
        phrase_sample = pm_snd(file_path)
        self_sample = pm_snd(s_f)
        player, closer = play_sound()
        # rows = [i for i in g.iterrows()]
        # random.shuffle(rows)
        print(ph)
        phon_stops = []
        for (i, phon) in g.iterrows():
            end_t = phon['end_time'] / 1000  # ms -> s
            phon_ch = phon['start_phoneme']
            phon_stops.append((end_t, phon_ch))
        plot_sample_pitch(phrase_sample, phons=phon_stops)
        plot_sample_pitch(self_sample)
        # player(phrase_sample)
        # input()
        # for (i,phon) in g.iterrows():
        #     # phon = g.iloc[1]
        #     start_t = phon['start_time']/1000
        #     end_t = phon['end_time']/1000
        #     phon_ch = phon['start_phoneme']
        #     phon_sample = phrase_sample.extract_part(from_time=start_t,to_time=end_t)
        #     if phon_sample.n_samples*phon_sample.sampling_period < 6.4/100:
        #         continue
        #     # if phon_ch[0] not in 'AEIOU':
        #     #     continue
        #     # player(phon_sample)
        #     # plot_sample_intensity(phon_sample)
        #     print(phon_ch)
        #     plot_sample_pitch(phon_sample)
        closer()


def plot_segments(collection_name='story_test_segments'):
    """Plot phoneme boundaries over each phrase's spectrogram frames."""
    seg_data = pd.read_csv('./outputs/' + collection_name + '.fixed.csv', index_col=0)
    phrase_groups = [i for i in seg_data.groupby(['phrase'])]
    for (ph, g) in phrase_groups:
        file_path = './outputs/' + collection_name + '/' + g.iloc[0]['filename']
        phrase_sample = pm_snd(file_path)
        # player,closer = play_sound()
        print(ph)
        phon_stops = []
        for (i, phon) in g.iterrows():
            end_t = phon['end_time'] / 1000  # ms -> s
            phon_ch = phon['start_phoneme']
            phon_stops.append((end_t, phon_ch))
        phrase_spec = phrase_sample.to_spectrogram(window_length=SPEC_WINDOW_SIZE,
                                                   maximum_frequency=SPEC_MAX_FREQUENCY)
        sg_db = 10 * np.log10(phrase_spec.values)
        # Binary boundary vector: 1 at frames where a phoneme ends.
        result = np.zeros(sg_db.shape[0], dtype=np.int64)
        ph_bounds = [t[0] for t in phon_stops[1:]]
        b_frames = np.asarray([spec_frame(phrase_spec, b) for b in ph_bounds])
        if len(b_frames) > 0:
            result[b_frames] = 1
        # print(audio)
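
# A minimal sketch of the time-to-frame arithmetic the functions below rely on,
# assuming parselmouth exposes Praat's sampling attributes on a Spectrogram:
# t1 (centre time of the first frame, seconds) and dt (time step, seconds).
# Praat numbers frames from 1, so frame i is centred at t1 + (i - 1) * dt and a
# boundary time t maps back to frame number (t - t1) / dt + 1. This helper is
# illustrative only; spec_frame below uses the library conversion directly.
def _time_to_frame_sketch(spec, t):
    return int(round((t - spec.t1) / spec.dt)) + 1
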
def generate_spec(aiff_file):
    """Load a sound file; return its clipped dB spectrogram and the raw spectrogram."""
    phrase_sample = pm_snd(aiff_file)
    phrase_spec = phrase_sample.to_spectrogram(window_length=SPEC_WINDOW_SIZE,
                                               maximum_frequency=SPEC_MAX_FREQUENCY)
    # Add machine epsilon before the log so silent frames don't produce -inf.
    sshow_abs = np.abs(phrase_spec.values + np.finfo(phrase_spec.values.dtype).eps)
    sg_db = 10 * np.log10(sshow_abs)
    sg_db[sg_db < 0] = 0
    return sg_db, phrase_spec


def spec_frame(spec, b):
    """Map a boundary time b (seconds) to the nearest spectrogram frame number."""
    return int(round(spec.time_to_frame_number(b)))


def _float_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))


def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))


def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))


def create_segments_tfrecords(collection_name='story_test_segments', sample_count=0, train_test_ratio=0.1):
    """Write one TFRecord example per phrase: the flattened spectrogram plus a
    binary per-frame vector marking phoneme boundaries."""
    audio_samples = pd.read_csv('./outputs/segments/' + collection_name + '/index.fixed.csv', index_col=0)
    audio_samples['file_path'] = audio_samples.loc[:, 'filename'].apply(
        lambda x: 'outputs/segments/' + collection_name + '/samples/' + x)
    n_records, n_spec, n_features = 0, 0, 0

    def write_samples(wg, sample_name):
        nonlocal n_records, n_spec, n_features
        phrase_groups = tqdm(wg, desc='Computing segmentation')
        record_file = './outputs/segments/{}/{}.tfrecords'.format(collection_name, sample_name)
        writer = tf.python_io.TFRecordWriter(record_file)
        for (ph, g) in phrase_groups:
            fname = g.iloc[0]['filename']
            sg_db, phrase_spec = generate_spec(g.iloc[0]['file_path'])
            phrase_groups.set_postfix(phrase=ph)
            spec_n, spec_w = sg_db.shape
            spec = sg_db.reshape(-1)
            phon_stops = []
            for (i, phon) in g.iterrows():
                end_t = phon['end_time'] / 1000  # ms -> s
                phon_ch = phon['start_phoneme']
                phon_stops.append((end_t, phon_ch))
            # Binary boundary vector: 1 at frames where a phoneme ends.
            result = np.zeros(spec_n, dtype=np.int64)
            ph_bounds = [t[0] for t in phon_stops]
            f_bounds = [spec_frame(phrase_spec, b) for b in ph_bounds]
            valid_bounds = [i for i in f_bounds if 0 < i < spec_n]
            b_frames = np.asarray(valid_bounds)
            if len(b_frames) > 0:
                result[b_frames] = 1
            n_spec = max([n_spec, spec_n])
            n_features = spec_w
            n_records += 1
            example = tf.train.Example(features=tf.train.Features(feature={
                'phrase': _bytes_feature([ph.encode('utf-8')]),
                'file': _bytes_feature([fname.encode('utf-8')]),
                'spec': _float_feature(spec),
                'spec_n': _int64_feature([spec_n]),
                'spec_w': _int64_feature([spec_w]),
                'output': _int64_feature(result),
            }))
            writer.write(example.SerializeToString())
        phrase_groups.close()
        writer.close()

    word_groups = [i for i in audio_samples.groupby('phrase')]
    wg_sampled = reservoir_sample(word_groups, sample_count) if sample_count > 0 else word_groups
    # write_samples(word_groups,'all')
    tr_audio_samples, te_audio_samples = train_test_split(wg_sampled, test_size=train_test_ratio)
    write_samples(tr_audio_samples, 'train')
    write_samples(te_audio_samples, 'test')
    const_file = './outputs/segments/' + collection_name + '/constants.pkl'
    pickle.dump((n_spec, n_features, n_records), open(const_file, 'wb'))


def record_generator_count(records_file):
    """Return a fresh record iterator along with the number of records in the file."""
    record_iterator = tf.python_io.tf_record_iterator(path=records_file)
    count = 0
    for i in record_iterator:
        count += 1
    record_iterator = tf.python_io.tf_record_iterator(path=records_file)
    return record_iterator, count
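
# Hedged alternative sketch: the records written above could also be consumed
# through the tf.data pipeline instead of the manual tf_record_iterator loop
# used below. This assumes the TF1-era names (tf.parse_single_example,
# tf.VarLenFeature, tf.sparse_tensor_to_dense) matching the tf.python_io API
# this file targets. Note padded_batch pads per batch, whereas the generator
# below pads every example to the global n_spec from constants.pkl.
def make_segments_dataset_sketch(records_file, batch_size=32):
    feature_spec = {
        'spec': tf.VarLenFeature(tf.float32),
        'spec_n': tf.FixedLenFeature([1], tf.int64),
        'spec_w': tf.FixedLenFeature([1], tf.int64),
        'output': tf.VarLenFeature(tf.int64),
    }

    def _parse(record):
        parsed = tf.parse_single_example(record, feature_spec)
        # Restore the (frames, features) spectrogram from the flat float list.
        shape = tf.stack([parsed['spec_n'][0], parsed['spec_w'][0]])
        spec = tf.reshape(tf.sparse_tensor_to_dense(parsed['spec']), shape)
        output = tf.sparse_tensor_to_dense(parsed['output'])
        return spec, output

    return (tf.data.TFRecordDataset(records_file)
            .map(_parse)
            .padded_batch(batch_size, padded_shapes=([None, None], [None])))
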
def read_segments_tfrecords_generator(collection_name='audio', batch_size=32, test_size=0):
    """Return (train batch generator, test inputs, test outputs, constants accessor)."""
    records_file = './outputs/segments/' + collection_name + '/train.tfrecords'
    const_file = './outputs/segments/' + collection_name + '/constants.pkl'
    (n_spec, n_features, n_records) = pickle.load(open(const_file, 'rb'))

    def copy_read_consts(dest_dir):
        shutil.copy2(const_file, dest_dir + '/constants.pkl')
        return (n_spec, n_features, n_records)

    # @threadsafe_iter
    def record_generator():
        print('reading tfrecords({}-train)...'.format(collection_name))
        input_data = []
        output_data = []
        while True:
            record_iterator, records_count = record_generator_count(records_file)
            for (i, string_record) in enumerate(record_iterator):
                example = tf.train.Example()
                example.ParseFromString(string_record)
                spec_n = example.features.feature['spec_n'].int64_list.value[0]
                spec_w = example.features.feature['spec_w'].int64_list.value[0]
                spec = np.array(example.features.feature['spec'].float_list.value).reshape(spec_n, spec_w)
                p_spec = padd_zeros(spec, n_spec)
                input_data.append(p_spec)
                output = np.asarray(example.features.feature['output'].int64_list.value)
                p_output = np.pad(output, (0, n_spec - output.shape[0]), 'constant')
                output_data.append(p_output)
                # Flush when the batch is full, or on the last record of the epoch
                # (records_count is the size of this file; the pickled n_records
                # counts train and test together).
                if len(input_data) == batch_size or i == records_count - 1:
                    yield (np.asarray(input_data), np.asarray(output_data))
                    input_data = []
                    output_data = []

    # Read the test set in one shot.
    print('reading tfrecords({}-test)...'.format(collection_name))
    te_records_file = './outputs/segments/' + collection_name + '/test.tfrecords'
    te_re_iterator, te_n_records = record_generator_count(te_records_file)
    test_size = min([test_size, te_n_records]) if test_size > 0 else te_n_records
    input_data = np.zeros((test_size, n_spec, n_features))
    output_data = np.zeros((test_size, n_spec))
    random_samples = enumerate(reservoir_sample(te_re_iterator, test_size))
    for (i, string_record) in tqdm(random_samples, total=test_size):
        example = tf.train.Example()
        example.ParseFromString(string_record)
        spec_n = example.features.feature['spec_n'].int64_list.value[0]
        spec_w = example.features.feature['spec_w'].int64_list.value[0]
        spec = np.array(example.features.feature['spec'].float_list.value).reshape(spec_n, spec_w)
        p_spec = padd_zeros(spec, n_spec)
        input_data[i] = p_spec
        output = np.asarray(example.features.feature['output'].int64_list.value)
        p_output = np.pad(output, (0, n_spec - output.shape[0]), 'constant')
        output_data[i] = p_output
    return record_generator, input_data, output_data, copy_read_consts


if __name__ == '__main__':
    # plot_random_phrases()
    # fix_csv('story_test_segments')
    # plot_segments('story_test_segments')
    # fix_csv('story_words')
    create_segments_tfrecords('story_words.30', sample_count=36, train_test_ratio=0.1)
    # record_generator,input_data,output_data,copy_read_consts = read_segments_tfrecords_generator('story_test')
    # tr_gen = record_generator()
    # for i in tr_gen:
    #     print(i[0].shape,i[1].shape)
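
# Hedged usage sketch for the generator above (not executed): build_model is a
# hypothetical stand-in for the segmentation network trained on these batches;
# only the generator plumbing comes from this file. record_generator() loops
# forever, so a Keras-style loop needs steps_per_epoch; n_records from
# constants.pkl counts train and test records together, so it is an upper bound.
#
# record_generator, te_in, te_out, copy_read_consts = \
#     read_segments_tfrecords_generator('story_words.30', batch_size=32)
# n_spec, n_features, n_records = copy_read_consts('./outputs')
# model = build_model(n_spec, n_features)  # hypothetical
# model.fit_generator(record_generator(),
#                     steps_per_epoch=math.ceil(n_records / 32),
#                     validation_data=(te_in, te_out), epochs=10)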