import pandas as pd from speech_tools import apply_by_multiprocessing,threadsafe_iter # import dask as dd # import dask.dataframe as ddf import tensorflow as tf from tensorflow.python.ops import data_flow_ops import numpy as np from speech_spectrum import generate_aiff_spectrogram from speech_pitch import compute_mfcc from sklearn.model_selection import train_test_split import itertools import os import random import csv import gc import pickle from tqdm import tqdm def siamese_pairs(rightGroup, wrongGroup): group1 = [r for (i, r) in rightGroup.iterrows()] group2 = [r for (i, r) in wrongGroup.iterrows()] rightWrongPairs = [(g1, g2) for g2 in group2 for g1 in group1]+[(g2, g1) for g2 in group2 for g1 in group1] rightRightPairs = [i for i in itertools.permutations(group1, 2)]#+[i for i in itertools.combinations(group2, 2)] def filter_criteria(s1,s2): same = s1['variant'] == s2['variant'] phon_same = s1['phonemes'] == s2['phonemes'] voice_diff = s1['voice'] != s2['voice'] if not same and phon_same: return False if same and not voice_diff: return False return True validRWPairs = [i for i in rightWrongPairs if filter_criteria(*i)] validRRPairs = [i for i in rightRightPairs if filter_criteria(*i)] random.shuffle(validRWPairs) random.shuffle(validRRPairs) # return rightRightPairs[:10],rightWrongPairs[:10] return validRWPairs[:32],validRRPairs[:32] def _float_feature(value): return tf.train.Feature(float_list=tf.train.FloatList(value=value)) def _int64_feature(value): return tf.train.Feature(int64_list=tf.train.Int64List(value=value)) def _bytes_feature(value): return tf.train.Feature(bytes_list=tf.train.BytesList(value=value)) def create_spectrogram_tfrecords(audio_group='audio',sample_count=0,train_test_ratio=0.1): ''' http://warmspringwinds.github.io/tensorflow/tf-slim/2016/12/21/tfrecords-guide/ http://www.machinelearninguru.com/deep_learning/tensorflow/basics/tfrecord/tfrecord.html ''' audio_samples = pd.read_csv( './outputs/' + audio_group + '.fixed.csv',index_col=0) audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x) n_records,n_spec,n_features = 0,0,0 def write_samples(wg,sample_name): word_group_prog = tqdm(wg,desc='Computing spectrogram') record_file = './outputs/{}.{}.tfrecords'.format(audio_group,sample_name) writer = tf.python_io.TFRecordWriter(record_file) for (w, word_group) in word_group_prog: word_group_prog.set_postfix(word=w,sample_name=sample_name) g = word_group.reset_index() # g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram) g['spectrogram'] = apply_by_multiprocessing(g['file_path'],compute_mfcc) sample_right = g.loc[g['variant'] == 'low'] sample_wrong = g.loc[g['variant'] == 'medium'] same, diff = siamese_pairs(sample_right, sample_wrong) groups = [([0,1],same),([1,0],diff)] for (output,group) in groups: group_prog = tqdm(group,desc='Writing Spectrogram') for sample1,sample2 in group_prog: group_prog.set_postfix(output=output ,var1=sample1['variant'] ,var2=sample2['variant']) spectro1,spectro2 = sample1['spectrogram'],sample2['spectrogram'] spec_n1,spec_n2 = spectro1.shape[0],spectro2.shape[0] spec_w1,spec_w2 = spectro1.shape[1],spectro2.shape[1] spec1,spec2 = spectro1.reshape(-1),spectro2.reshape(-1) nonlocal n_spec,n_records,n_features n_spec = max([n_spec,spec_n1,spec_n2]) n_features = spec_w1 n_records+=1 example = tf.train.Example(features=tf.train.Features( feature={ 'word': _bytes_feature([w.encode('utf-8')]), 'phoneme1': _bytes_feature([sample1['phonemes'].encode('utf-8')]), 'phoneme2': _bytes_feature([sample2['phonemes'].encode('utf-8')]), 'voice1': _bytes_feature([sample1['voice'].encode('utf-8')]), 'voice2': _bytes_feature([sample2['voice'].encode('utf-8')]), 'language': _bytes_feature([sample1['language'].encode('utf-8')]), 'rate1':_int64_feature([sample1['rate']]), 'rate2':_int64_feature([sample2['rate']]), 'variant1': _bytes_feature([sample1['variant'].encode('utf-8')]), 'variant2': _bytes_feature([sample2['variant'].encode('utf-8')]), 'file1': _bytes_feature([sample1['file'].encode('utf-8')]), 'file2': _bytes_feature([sample2['file'].encode('utf-8')]), 'spec1':_float_feature(spec1), 'spec2':_float_feature(spec2), 'spec_n1':_int64_feature([spec_n1]), 'spec_w1':_int64_feature([spec_w1]), 'spec_n2':_int64_feature([spec_n2]), 'spec_w2':_int64_feature([spec_w2]), 'output':_int64_feature(output) } )) writer.write(example.SerializeToString()) group_prog.close() word_group_prog.close() writer.close() word_groups = [i for i in audio_samples.groupby('word')] wg_sampled = reservoir_sample(word_groups,sample_count) if sample_count > 0 else word_groups tr_audio_samples,te_audio_samples = train_test_split(wg_sampled,test_size=train_test_ratio) write_samples(tr_audio_samples,'train') write_samples(te_audio_samples,'test') const_file = os.path.join('./outputs',audio_group+'.constants') pickle.dump((n_spec,n_features,n_records),open(const_file,'wb')) def padd_zeros(spgr, max_samples): return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)], 'constant') def reservoir_sample(iterable, k): it = iter(iterable) if not (k > 0): raise ValueError("sample size must be positive") sample = list(itertools.islice(it, k)) # fill the reservoir random.shuffle(sample) # if number of items less then *k* then # return all items in random order. for i, item in enumerate(it, start=k+1): j = random.randrange(i) # random [0..i) if j < k: sample[j] = item # replace item with gradually decreasing probability return sample def read_siamese_tfrecords_generator(audio_group='audio',batch_size=32,test_size=0): records_file = os.path.join('./outputs',audio_group+'.train.tfrecords') input_pairs = [] output_class = [] const_file = os.path.join('./outputs',audio_group+'.constants') (n_spec,n_features,n_records) = pickle.load(open(const_file,'rb')) print('reading tfrecords({}-train)...'.format(audio_group)) # @threadsafe_iter def record_generator(): input_data = [] output_data = [] while True: record_iterator = tf.python_io.tf_record_iterator(path=records_file) #tqdm(enumerate(record_iterator),total=n_records) for (i,string_record) in enumerate(record_iterator): example = tf.train.Example() example.ParseFromString(string_record) spec_n1 = example.features.feature['spec_n1'].int64_list.value[0] spec_n2 = example.features.feature['spec_n2'].int64_list.value[0] spec_w1 = example.features.feature['spec_w1'].int64_list.value[0] spec_w2 = example.features.feature['spec_w2'].int64_list.value[0] spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1) spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2) p_spec1,p_spec2 = padd_zeros(spec1,n_spec),padd_zeros(spec2,n_spec) input_data.append(np.asarray([p_spec1,p_spec2])) output = example.features.feature['output'].int64_list.value output_data.append(np.asarray(output)) if len(input_data) == batch_size or i == n_records-1: input_arr = np.asarray(input_data) output_arr = np.asarray(output_data) yield ([input_arr[:, 0], input_arr[:, 1]],output_arr) input_data = [] output_data = [] # Read test in one-shot te_records_file = os.path.join('./outputs',audio_group+'.test.tfrecords') te_re_iterator = tf.python_io.tf_record_iterator(path=records_file) te_n_records = len([i for i in te_re_iterator]) te_re_iterator = tf.python_io.tf_record_iterator(path=records_file) print('reading tfrecords({}-test)...'.format(audio_group)) test_size = min([test_size,te_n_records]) if test_size > 0 else te_n_records input_data = np.zeros((test_size,2,n_spec,n_features)) output_data = np.zeros((test_size,2)) random_samples = enumerate(reservoir_sample(te_re_iterator,test_size)) for (i,string_record) in tqdm(random_samples,total=test_size): example = tf.train.Example() example.ParseFromString(string_record) spec_n1 = example.features.feature['spec_n1'].int64_list.value[0] spec_n2 = example.features.feature['spec_n2'].int64_list.value[0] spec_w1 = example.features.feature['spec_w1'].int64_list.value[0] spec_w2 = example.features.feature['spec_w2'].int64_list.value[0] spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1) spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2) p_spec1,p_spec2 = padd_zeros(spec1,n_spec),padd_zeros(spec2,n_spec) input_data[i] = np.asarray([p_spec1,p_spec2]) output = example.features.feature['output'].int64_list.value output_data[i] = np.asarray(output) return record_generator,input_data,output_data,n_spec,n_features,n_records def audio_samples_word_count(audio_group='audio'): audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv') return len(audio_samples.groupby(audio_samples['word'])) def record_generator_count(records_file): record_iterator = tf.python_io.tf_record_iterator(path=records_file) count = len([i for i in record_iterator]) record_iterator = tf.python_io.tf_record_iterator(path=records_file) return record_iterator,count def fix_csv(audio_group='audio'): audio_csv_lines = open('./outputs/' + audio_group + '.csv','r').readlines() audio_csv_data = [i.strip().split(',') for i in audio_csv_lines] proper_rows = [i for i in audio_csv_data if len(i) == 7] with open('./outputs/' + audio_group + '.fixed.csv','w') as fixed_csv: fixed_csv_w = csv.writer(fixed_csv, quoting=csv.QUOTE_MINIMAL) fixed_csv_w.writerows(proper_rows) audio_samples = pd.read_csv( './outputs/' + audio_group + '.fixed.csv' , names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file']) audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x) audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_path'], os.path.exists) audio_samples = audio_samples[audio_samples['file_exists'] == True] audio_samples = audio_samples.drop(['file_path','file_exists'],axis=1).reset_index(drop=True) audio_samples.to_csv('./outputs/' + audio_group + '.fixed.csv') def convert_old_audio(): audio_samples = pd.read_csv( './outputs/audio.csv.old' , names=['word', 'voice', 'rate', 'variant', 'file']) audio_samples['phonemes'] = 'unknown' audio_samples['language'] = 'en-US' audio_samples.loc[audio_samples['variant'] == 'normal','variant'] = 'low' audio_samples.loc[audio_samples['variant'] == 'phoneme','variant'] = 'medium' audio_samples = audio_samples[['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file']] audio_samples.to_csv('./outputs/audio_new.csv',index=False,header=False) if __name__ == '__main__': # sunflower_pairs_data() # create_spectrogram_data() # create_spectrogram_data('story_words') # create_spectrogram_tfrecords('story_words') # create_spectrogram_tfrecords('story_words_test') # read_siamese_tfrecords('story_all') # read_siamese_tfrecords('story_words_test') # padd_zeros_siamese_tfrecords('story_words') # fix_csv('story_words') # pickle_constants('story_words') # create_spectrogram_tfrecords('audio',sample_count=100) # create_spectrogram_tfrecords('story_all',sample_count=25) # fix_csv('story_words_test') #fix_csv('story_phrases') create_spectrogram_tfrecords('story_phrases',sample_count=10,train_test_ratio=0.1) # create_spectrogram_tfrecords('audio',sample_count=50) # read_siamese_tfrecords_generator('audio') # padd_zeros_siamese_tfrecords('audio') # create_padded_spectrogram() # create_speech_pairs_data() # print(speech_model_data())