import pandas as pd
from speech_tools import *
from speech_pitch import *
import tensorflow as tf
import numpy as np
from speech_spectrum import generate_aiff_spectrogram, generate_sample_spectrogram
from speech_similar import segmentable_phoneme
from sklearn.model_selection import train_test_split
import os
import shutil
import random
import csv
import pickle
import itertools
from tqdm import tqdm


def _valid_pairs(rightGroup, wrongGroup):
    '''Build the candidate (right, right) and (right, wrong) sample pairs for
    a word and drop ambiguous pairs whose variants differ but whose phoneme
    strings are identical. Both pair lists are returned shuffled.'''
    group1 = [r for (i, r) in rightGroup.iterrows()]
    group2 = [r for (i, r) in wrongGroup.iterrows()]
    rightWrongPairs = [(g1, g2) for g2 in group2 for g1 in group1]
    rightRightPairs = list(itertools.combinations(group1, 2))

    def filter_criteria(s1, s2):
        same_variant = s1['variant'] == s2['variant']
        same_phonemes = s1['phonemes'] == s2['phonemes']
        # pairs labelled differently but pronounced identically carry no signal
        if not same_variant and same_phonemes:
            return False
        # a same-voice check was tried and disabled:
        # if same_variant and s1['voice'] == s2['voice']:
        #     return False
        return True

    validRWPairs = [p for p in rightWrongPairs if filter_criteria(*p)]
    validRRPairs = [p for p in rightRightPairs if filter_criteria(*p)]
    random.shuffle(validRWPairs)
    random.shuffle(validRRPairs)
    return validRRPairs, validRWPairs


def siamese_pairs(rightGroup, wrongGroup):
    '''Return up to 32 (right, right) and 32 (right, wrong) whole-word pairs.'''
    validRRPairs, validRWPairs = _valid_pairs(rightGroup, wrongGroup)
    return validRRPairs[:32], validRWPairs[:32]


def seg_siamese_pairs(rightGroup, wrongGroup):
    '''Return up to 32 matching and 32 mismatching phoneme-level pairs: each
    valid word pair is split into its phoneme segments, a spectrogram is
    computed per segment, and the segment pairs are grouped by whether the
    two phonemes agree.'''
    validRRPairs, validRWPairs = _valid_pairs(rightGroup, wrongGroup)
    rrPhonePairs = []
    rwPhonePairs = []

    def compute_seg_spec(s1, s2):
        # only usable when both samples have one segment per parsed phoneme
        phon_count = len(s1['parsed_phoneme'])
        seg1_count = len(s1['segments'].index)
        seg2_count = len(s2['segments'].index)
        if phon_count == seg1_count and seg2_count == phon_count:
            s1nd, s2nd = pm_snd(s1['file_path']), pm_snd(s2['file_path'])
            segs1 = [tuple(x) for x in s1['segments'][['start', 'end']].values]
            segs2 = [tuple(x) for x in s2['segments'][['start', 'end']].values]
            pp12 = zip(s1['parsed_phoneme'], s2['parsed_phoneme'], segs1, segs2)
            for (p1, p2, (s1s, s1e), (s2s, s2e)) in pp12:
                spc1 = generate_sample_spectrogram(s1nd.extract_part(s1s, s1e).values)
                spc2 = generate_sample_spectrogram(s2nd.extract_part(s2s, s2e).values)
                # copy per phoneme so each appended pair keeps its own
                # spectrogram instead of aliasing the previous iteration's
                s1_cp, s2_cp = s1.copy(), s2.copy()
                s1_cp['spectrogram'] = spc1
                s2_cp['spectrogram'] = spc2
                if repr(p1) == repr(p2):
                    rrPhonePairs.append((s1_cp, s2_cp))
                else:
                    rwPhonePairs.append((s1_cp, s2_cp))

    for (s1, s2) in validRRPairs:
        compute_seg_spec(s1, s2)
    for (s1, s2) in validRWPairs:
        compute_seg_spec(s1, s2)
    return rrPhonePairs[:32], rwPhonePairs[:32]


def _float_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))


def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
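
# Illustrative only, never called: round-trips values through the feature
# helpers above to show how scalars written by the record writers below come
# back when a serialized record is parsed. The keys and values are made up.
def _feature_roundtrip_sketch():
    ex = tf.train.Example(features=tf.train.Features(feature={
        'rate': _int64_feature([200]),
        'spec': _float_feature([0.5, 1.5]),
    }))
    parsed = tf.train.Example()
    parsed.ParseFromString(ex.SerializeToString())
    return parsed.features.feature['rate'].int64_list.value[0]  # -> 200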
def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))


def _write_pair_records(word_groups, audio_group, sample_name, pair_fn,
                        spectrogram_fn=None, stats=(0, 0, 0)):
    '''Write the siamese pairs of every word group to
    ./outputs/<audio_group>.<sample_name>.tfrecords and return the updated
    (n_spec, n_features, n_records) constants for the reader. This is the
    writer shared by create_spectrogram_tfrecords and
    create_seg_phonpair_tfrecords below.'''
    n_spec, n_features, n_records = stats
    record_file = './outputs/{}.{}.tfrecords'.format(audio_group, sample_name)
    writer = tf.python_io.TFRecordWriter(record_file)
    word_group_prog = tqdm(word_groups, desc='Computing spectrogram')
    for (w, word_group) in word_group_prog:
        word_group_prog.set_postfix(word=w, sample_name=sample_name)
        g = word_group.reset_index()
        if spectrogram_fn is not None:
            g['spectrogram'] = apply_by_multiprocessing(g['file_path'], spectrogram_fn)
        sample_right = g.loc[g['variant'] == 'low']
        sample_wrong = g.loc[g['variant'] == 'medium']
        same, diff = pair_fn(sample_right, sample_wrong)
        groups = [([0, 1], same), ([1, 0], diff)]
        for (output, group) in groups:
            group_prog = tqdm(group, desc='Writing Spectrogram')
            for sample1, sample2 in group_prog:
                group_prog.set_postfix(output=output,
                                       var1=sample1['variant'],
                                       var2=sample2['variant'])
                spectro1, spectro2 = sample1['spectrogram'], sample2['spectrogram']
                spec_n1, spec_n2 = spectro1.shape[0], spectro2.shape[0]
                spec_w1, spec_w2 = spectro1.shape[1], spectro2.shape[1]
                n_spec = max([n_spec, spec_n1, spec_n2])
                n_features = spec_w1
                n_records += 1
                example = tf.train.Example(features=tf.train.Features(feature={
                    'word': _bytes_feature([w.encode('utf-8')]),
                    'phoneme1': _bytes_feature([sample1['phonemes'].encode('utf-8')]),
                    'phoneme2': _bytes_feature([sample2['phonemes'].encode('utf-8')]),
                    'voice1': _bytes_feature([sample1['voice'].encode('utf-8')]),
                    'voice2': _bytes_feature([sample2['voice'].encode('utf-8')]),
                    'language': _bytes_feature([sample1['language'].encode('utf-8')]),
                    'rate1': _int64_feature([sample1['rate']]),
                    'rate2': _int64_feature([sample2['rate']]),
                    'variant1': _bytes_feature([sample1['variant'].encode('utf-8')]),
                    'variant2': _bytes_feature([sample2['variant'].encode('utf-8')]),
                    'file1': _bytes_feature([sample1['file'].encode('utf-8')]),
                    'file2': _bytes_feature([sample2['file'].encode('utf-8')]),
                    'spec1': _float_feature(spectro1.reshape(-1)),
                    'spec2': _float_feature(spectro2.reshape(-1)),
                    'spec_n1': _int64_feature([spec_n1]),
                    'spec_w1': _int64_feature([spec_w1]),
                    'spec_n2': _int64_feature([spec_n2]),
                    'spec_w2': _int64_feature([spec_w2]),
                    'output': _int64_feature(output),
                }))
                writer.write(example.SerializeToString())
            group_prog.close()
    word_group_prog.close()
    writer.close()
    return n_spec, n_features, n_records


def create_spectrogram_tfrecords(audio_group='audio', sample_count=0, train_test_ratio=0.1):
    '''Write train/test tfrecords of whole-word spectrogram pairs.

    http://warmspringwinds.github.io/tensorflow/tf-slim/2016/12/21/tfrecords-guide/
    http://www.machinelearninguru.com/deep_learning/tensorflow/basics/tfrecord/tfrecord.html
    '''
    audio_samples = pd.read_csv('./outputs/' + audio_group + '.fixed.csv', index_col=0)
    audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(
        lambda x: 'outputs/' + audio_group + '/' + x)
    word_groups = [i for i in audio_samples.groupby('word')]
    wg_sampled = reservoir_sample(word_groups, sample_count) if sample_count > 0 else word_groups
    tr_samples, te_samples = train_test_split(wg_sampled, test_size=train_test_ratio)
    # generate_aiff_spectrogram can be swapped for the other extractors that
    # were tried here (pitch_array, compute_mfcc)
    stats = _write_pair_records(tr_samples, audio_group, 'train',
                                siamese_pairs, generate_aiff_spectrogram)
    stats = _write_pair_records(te_samples, audio_group, 'test',
                                siamese_pairs, generate_aiff_spectrogram, stats)
    const_file = os.path.join('./outputs', audio_group + '.constants')
    pickle.dump(stats, open(const_file, 'wb'))
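
# `padd_zeros` is pulled in by the star import from speech_tools; the reader
# below relies on it to bring every spectrogram up to n_spec rows. A minimal
# sketch of the assumed behavior (zero-pad along the time axis), kept for
# reference only and never called:
def _padd_zeros_sketch(spec, n_rows):
    padded = np.zeros((n_rows, spec.shape[1]), dtype=spec.dtype)
    padded[:spec.shape[0], :] = spec
    return padded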
def read_siamese_tfrecords_generator(audio_group='audio', batch_size=32, test_size=0):
    '''Return (batch generator over the train split, test inputs, test
    targets, constants-copying helper) for the given audio group.'''
    records_file = os.path.join('./outputs', audio_group + '.train.tfrecords')
    const_file = os.path.join('./outputs', audio_group + '.constants')
    (n_spec, n_features, n_records) = pickle.load(open(const_file, 'rb'))

    def copy_read_consts(dest_dir):
        shutil.copy2(const_file, dest_dir + '/constants.pkl')
        return (n_spec, n_features, n_records)

    def parse_pair(string_record):
        # invert the writer above: reshape both flattened spectrograms with
        # their stored dimensions and zero-pad them to a common length
        example = tf.train.Example()
        example.ParseFromString(string_record)
        feats = example.features.feature
        spec_n1 = feats['spec_n1'].int64_list.value[0]
        spec_n2 = feats['spec_n2'].int64_list.value[0]
        spec_w1 = feats['spec_w1'].int64_list.value[0]
        spec_w2 = feats['spec_w2'].int64_list.value[0]
        spec1 = np.array(feats['spec1'].float_list.value).reshape(spec_n1, spec_w1)
        spec2 = np.array(feats['spec2'].float_list.value).reshape(spec_n2, spec_w2)
        p_spec1, p_spec2 = padd_zeros(spec1, n_spec), padd_zeros(spec2, n_spec)
        output = np.asarray(feats['output'].int64_list.value)
        return np.asarray([p_spec1, p_spec2]), output

    # @threadsafe_iter
    def record_generator():
        print('reading tfrecords({}-train)...'.format(audio_group))
        input_data = []
        output_data = []
        while True:  # loop forever so fit_generator-style training keeps drawing batches
            record_iterator, records_count = record_generator_count(records_file)
            for (i, string_record) in enumerate(record_iterator):
                pair, output = parse_pair(string_record)
                input_data.append(pair)
                output_data.append(output)
                # flush on a full batch or at the end of the train file; the
                # file's own record count is used here because the pickled
                # n_records also includes the test split
                if len(input_data) == batch_size or i == records_count - 1:
                    input_arr = np.asarray(input_data)
                    output_arr = np.asarray(output_data)
                    yield ([input_arr[:, 0], input_arr[:, 1]], output_arr)
                    input_data = []
                    output_data = []

    # read the test split in one shot
    print('reading tfrecords({}-test)...'.format(audio_group))
    te_records_file = os.path.join('./outputs', audio_group + '.test.tfrecords')
    te_re_iterator, te_n_records = record_generator_count(te_records_file)
    test_size = min([test_size, te_n_records]) if test_size > 0 else te_n_records
    input_data = np.zeros((test_size, 2, n_spec, n_features))
    output_data = np.zeros((test_size, 2))
    random_samples = enumerate(reservoir_sample(te_re_iterator, test_size))
    for (i, string_record) in tqdm(random_samples, total=test_size):
        input_data[i], output_data[i] = parse_pair(string_record)
    return record_generator, input_data, output_data, copy_read_consts


def audio_samples_word_count(audio_group='audio'):
    audio_samples = pd.read_csv('./outputs/' + audio_group + '.csv')
    return len(audio_samples.groupby('word'))
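
# Illustrative only: one assumed way to consume the reader above with a
# two-input Keras-style siamese network. `model` is hypothetical, and the
# 'audio' tfrecords are assumed to already exist on disk.
def _generator_usage_sketch():
    gen, te_x, te_y, copy_consts = read_siamese_tfrecords_generator('audio', batch_size=32)
    n_spec, n_features, n_records = copy_consts('./outputs')
    steps = max(1, n_records // 32)
    # the generator yields ([left_batch, right_batch], targets), so validation
    # data follows the same two-input convention:
    # model.fit_generator(gen(), steps_per_epoch=steps,
    #                     validation_data=([te_x[:, 0], te_x[:, 1]], te_y))
    return steps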
def record_generator_count(records_file):
    '''Count the records in a tfrecords file and return a fresh iterator over
    it together with the count (tf_record_iterator is single-pass, so the
    file is opened twice).'''
    count = 0
    for _ in tf.python_io.tf_record_iterator(path=records_file):
        count += 1
    record_iterator = tf.python_io.tf_record_iterator(path=records_file)
    return record_iterator, count


def fix_csv(audio_group='audio'):
    '''Drop malformed rows (wrong column count) and rows whose audio file is
    missing, then rewrite <audio_group>.fixed.csv with a header row.'''
    audio_csv_lines = open('./outputs/' + audio_group + '.csv', 'r').readlines()
    audio_csv_data = [i.strip().split(',') for i in audio_csv_lines]
    proper_rows = [i for i in audio_csv_data if len(i) == 7]
    with open('./outputs/' + audio_group + '.fixed.csv', 'w') as fixed_csv:
        fixed_csv_w = csv.writer(fixed_csv, quoting=csv.QUOTE_MINIMAL)
        fixed_csv_w.writerows(proper_rows)
    audio_samples = pd.read_csv(
        './outputs/' + audio_group + '.fixed.csv',
        names=['word', 'phonemes', 'voice', 'language', 'rate', 'variant', 'file'])
    audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(
        lambda x: 'outputs/' + audio_group + '/' + x)
    audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_path'], os.path.exists)
    audio_samples = audio_samples[audio_samples['file_exists']]
    audio_samples = audio_samples.drop(['file_path', 'file_exists'], axis=1).reset_index(drop=True)
    audio_samples.to_csv('./outputs/' + audio_group + '.fixed.csv')


def convert_old_audio():
    '''Migrate the old five-column audio.csv to the current seven-column layout.'''
    audio_samples = pd.read_csv(
        './outputs/audio.csv.old',
        names=['word', 'voice', 'rate', 'variant', 'file'])
    audio_samples['phonemes'] = 'unknown'
    audio_samples['language'] = 'en-US'
    audio_samples.loc[audio_samples['variant'] == 'normal', 'variant'] = 'low'
    audio_samples.loc[audio_samples['variant'] == 'phoneme', 'variant'] = 'medium'
    audio_samples = audio_samples[['word', 'phonemes', 'voice', 'language', 'rate', 'variant', 'file']]
    audio_samples.to_csv('./outputs/audio_new.csv', index=False, header=False)


def generate_sppas_trans(audio_group='story_words.all'):
    '''Run transcription (transribe_audio_text from speech_tools) over every
    sample in the group so that segment files exist for the phoneme pipeline.'''
    audio_samples = pd.read_csv('./outputs/' + audio_group + '.fixed.csv', index_col=0)
    audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(
        lambda x: 'outputs/' + audio_group + '/' + x)
    rows = tqdm(audio_samples.iterrows(), total=len(audio_samples.index),
                desc='Transcribing Words ')
    for (i, row) in rows:
        rows.set_postfix(word=row['word'])
        transribe_audio_text(row['file_path'], row['word'])
    rows.close()
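
# `reservoir_sample` also comes from the speech_tools star import; the
# writers and the test reader use it to draw a fixed-size uniform sample
# from a sequence or iterator of unknown length. A standard Algorithm R
# sketch of the assumed behavior, for reference only and never called:
def _reservoir_sample_sketch(iterable, k):
    reservoir = []
    for (i, item) in enumerate(iterable):
        if i < k:
            reservoir.append(item)
        else:
            j = random.randint(0, i)  # inclusive bounds
            if j < k:
                reservoir[j] = item
    return reservoir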
def create_seg_phonpair_tfrecords(audio_group='story_words.all', sample_count=0, train_test_ratio=0.1):
    '''Write train/test tfrecords of phoneme-segment spectrogram pairs. The
    spectrograms are computed per segment inside seg_siamese_pairs, so no
    whole-file spectrogram function is passed to the writer.'''
    audio_samples = pd.read_csv('./outputs/' + audio_group + '.fixed.csv', index_col=0)
    audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(
        lambda x: 'outputs/' + audio_group + '/' + x)
    audio_samples = audio_samples[(audio_samples['variant'] == 'low')
                                  | (audio_samples['variant'] == 'medium')]
    audio_samples['parsed_phoneme'] = apply_by_multiprocessing(
        audio_samples['phonemes'], segmentable_phoneme)
    audio_samples['segments'] = apply_by_multiprocessing(
        audio_samples['file_path'], read_seg_file)
    word_groups = [i for i in audio_samples.groupby('word')]
    wg_sampled = reservoir_sample(word_groups, sample_count) if sample_count > 0 else word_groups
    tr_samples, te_samples = train_test_split(wg_sampled, test_size=train_test_ratio)
    stats = _write_pair_records(tr_samples, audio_group, 'train', seg_siamese_pairs)
    stats = _write_pair_records(te_samples, audio_group, 'test', seg_siamese_pairs, stats=stats)
    const_file = os.path.join('./outputs', audio_group + '.constants')
    pickle.dump(stats, open(const_file, 'wb'))


if __name__ == '__main__':
    # typical pipeline order for a new audio group:
    # fix_csv('test_5_words')
    # generate_sppas_trans('test_5_words')
    # create_spectrogram_tfrecords('story_words.all', sample_count=0, train_test_ratio=0.1)
    create_seg_phonpair_tfrecords('test_5_words')