import math
import pickle
import random
from functools import reduce

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# NOTE(review): pm_snd, play_sound and plot_sample_pitch are not defined in
# this module; presumably they come from this wildcard import - confirm and
# replace it with explicit names.
from speech_pitch import *
from speech_tools import reservoir_sample

# %matplotlib inline

# Spectrogram settings shared by every function that builds a spectrogram.
SPEC_MAX_FREQUENCY = 8000
SPEC_WINDOW_SIZE = 0.03


def fix_csv(collection_name='test'):
    """Re-save a segment index CSV with named columns.

    Reads ``./outputs/segments/<collection_name>/index.csv`` (which has no
    header row) and writes ``index.fixed.csv`` next to it with the column
    names assigned below plus the default pandas index column.
    """
    base_dir = './outputs/segments/' + collection_name
    columns = ['phrase', 'filename', 'start_phoneme', 'end_phoneme',
               'start_time', 'end_time']
    seg_data = pd.read_csv(base_dir + '/index.csv', names=columns)
    seg_data.to_csv(base_dir + '/index.fixed.csv')


def pick_random_phrases(collection_name='test', sample_size=10):
    """Pick random phrases from a collection and save them to a CSV.

    Reads ``./outputs/<collection_name>.fixed.csv`` and writes the chosen
    phrases to ``./outputs/<collection_name>.random.csv``.

    Args:
        collection_name: Collection whose index is sampled.  (The original
            code overwrote this argument with ``'test'``; it is now honoured.)
        sample_size: Number of phrases to pick.  Clamped to the number of
            distinct phrases so small collections no longer raise
            ``ValueError`` from ``random.sample``.
    """
    seg_data = pd.read_csv('./outputs/' + collection_name + '.fixed.csv',
                           index_col=0)
    # Group by a scalar key so phrase names are plain values, not 1-tuples.
    phrases = [ph for ph, _ in seg_data.groupby('phrase')]
    chosen = random.sample(phrases, min(sample_size, len(phrases)))
    pd.DataFrame(chosen, columns=['phrase']).to_csv(
        './outputs/' + collection_name + '.random.csv')


def _phoneme_stops(group):
    """Return ``(end_time / 1000, start_phoneme)`` for every row of *group*.

    ``end_time`` is presumably stored in milliseconds, which would make the
    first tuple element seconds - TODO confirm against the index producer.
    """
    return [(phon['end_time'] / 1000, phon['start_phoneme'])
            for _, phon in group.iterrows()]


def _boundary_targets(phrase_spec, stop_times, n_frames):
    """Build an int32 vector of length *n_frames* with 1 at boundary frames.

    Each stop time is mapped through ``spec_frame``; only frame indices
    strictly between 0 and *n_frames* are marked.
    """
    frames = [spec_frame(phrase_spec, t) for t in stop_times]
    valid = [f for f in frames if 0 < f < n_frames]
    targets = np.zeros(n_frames, dtype=np.int32)
    # An explicit integer dtype is required: np.asarray([]) is float64 and
    # NumPy refuses to index with a float array, so an empty boundary list
    # used to raise IndexError.
    targets[np.asarray(valid, dtype=np.intp)] = 1
    return targets


def plot_random_phrases(collection_name='test'):
    """Plot pitch for the previously picked random phrases.

    Reads ``./outputs/<collection_name>.random.csv`` and
    ``./outputs/<collection_name>.fixed.csv``, then for each selected phrase
    plots the collection sample (with phoneme stops) followed by the paired
    recording from ``./inputs/self/``.
    """
    rand_words = pd.read_csv('./outputs/' + collection_name + '.random.csv',
                             index_col=0)
    rand_w_list = rand_words['phrase'].tolist()
    seg_data = pd.read_csv('./outputs/' + collection_name + '.fixed.csv',
                           index_col=0)
    # isin() replaces the manual OR-accumulation and also copes with an
    # empty phrase list.
    selected = seg_data[seg_data['phrase'].isin(rand_w_list)]
    phrase_groups = list(selected.groupby('phrase'))
    self_files = [
        'a_wrong_turn-low1.aiff',
        'great_pin-low1.aiff',
        'he_set_off_at_once_to_find_the_beast-low1.aiff',
        'hound-low1.aiff',
        'noises-low1.aiff',
        'po_burped-low1.aiff',
        'she_loves_the_roses-low1.aiff',
        'the_busy_spider-low1.aiff',
        'the_rain_helped-low1.aiff',
        'to_go_to_the_doctor-low1.aiff',
    ]
    co_files = ['./inputs/self/' + name for name in self_files]
    # zip() pairs groups with recordings positionally and stops at the
    # shorter of the two sequences.
    for (ph, g), s_f in zip(phrase_groups, co_files):
        # Use the requested collection rather than a hard-coded 'test' dir.
        file_path = './outputs/' + collection_name + '/' + g.iloc[0]['filename']
        phrase_sample = pm_snd(file_path)
        self_sample = pm_snd(s_f)
        # NOTE(review): the returned player/closer are never used and closer
        # is never called; if play_sound() acquires an audio device this
        # leaks one per phrase - confirm and hoist or remove.
        player, closer = play_sound()
        print(ph)
        plot_sample_pitch(phrase_sample, phons=_phoneme_stops(g))
        plot_sample_pitch(self_sample)


def plot_segments(collection_name='story_test_segments'):
    """Compute per-phrase boundary targets for a segmented collection.

    Reads ``./outputs/<collection_name>.fixed.csv`` and, for every phrase,
    builds the spectrogram of its first listed file and the boundary target
    vector.  Nothing is plotted or returned; the phrase name is printed.

    Args:
        collection_name: Collection to process.  (The original code
            overwrote this argument with ``'story_test_segments'``; it is
            now honoured.)
    """
    seg_data = pd.read_csv('./outputs/' + collection_name + '.fixed.csv',
                           index_col=0)
    for ph, g in seg_data.groupby('phrase'):
        file_path = './outputs/' + collection_name + '/' + g.iloc[0]['filename']
        print(ph)
        sg_db, phrase_spec = generate_spec(file_path)
        # The first stop is skipped here, as in the original code.
        stop_times = [t for t, _ in _phoneme_stops(g)[1:]]
        # NOTE(review): the target vector is computed but not used.
        result = _boundary_targets(phrase_spec, stop_times, sg_db.shape[0])


def generate_spec(aiff_file):
    """Load *aiff_file* and return ``(sg_db, phrase_spec)``.

    ``phrase_spec`` is the spectrogram object produced with the module-level
    window size and maximum frequency; ``sg_db`` is ``10 * log10`` of its
    values.
    """
    phrase_sample = pm_snd(aiff_file)
    phrase_spec = phrase_sample.to_spectrogram(
        window_length=SPEC_WINDOW_SIZE,
        maximum_frequency=SPEC_MAX_FREQUENCY)
    sg_db = 10 * np.log10(phrase_spec.values)
    return sg_db, phrase_spec


def spec_frame(spec, b):
    """Map *b* to an integer frame index of *spec*.

    NOTE(review): every caller passes a time (``end_time / 1000``), yet this
    calls ``frame_number_to_time``, which by its name converts the other
    way.  A time-to-frame lookup is probably what was intended - confirm
    against the spectrogram API before changing it.
    """
    return int(round(spec.frame_number_to_time(b)))


def _float_feature(value):
    """Wrap an iterable of floats in a ``tf.train.Feature``."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))


def _int64_feature(value):
    """Wrap an iterable of ints in a ``tf.train.Feature``."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))


def _bytes_feature(value):
    """Wrap an iterable of byte strings in a ``tf.train.Feature``."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))


def create_segments_tfrecords(collection_name='story_test_segments',
                              sample_count=0, train_test_ratio=0.1):
    """Write train/test TFRecord files for a segmented collection.

    Reads ``./outputs/segments/<collection_name>/index.fixed.csv``, groups
    rows by phrase, splits the groups into train and test sets and writes
    ``train.tfrecords`` / ``test.tfrecords`` into the same directory.  A
    ``constants.pkl`` file holding ``(n_spec, n_features, n_records)`` is
    written alongside.

    Args:
        collection_name: Sub-directory of ``./outputs/segments`` to process.
        sample_count: When greater than 0, the phrase groups are reduced
            with ``reservoir_sample`` before splitting.
        train_test_ratio: Passed to ``train_test_split`` as ``test_size``.
    """
    base_dir = './outputs/segments/' + collection_name
    audio_samples = pd.read_csv(base_dir + '/index.fixed.csv', index_col=0)
    audio_samples['file_path'] = audio_samples.loc[:, 'filename'].apply(
        lambda x: 'outputs/segments/' + collection_name + '/samples/' + x)
    n_records, n_spec, n_features = 0, 0, 0

    # TensorFlow 2 dropped tf.python_io; fall back to tf.io there.
    if hasattr(tf, 'python_io'):
        record_writer = tf.python_io.TFRecordWriter
    else:
        record_writer = tf.io.TFRecordWriter

    def write_samples(wg, sample_name):
        """Serialize every ``(phrase, rows)`` pair in *wg* to one file."""
        nonlocal n_records, n_spec, n_features
        record_file = './outputs/segments/{}/{}.tfrecords'.format(
            collection_name, sample_name)
        phrase_groups = tqdm(wg, desc='Computing segmentation')
        writer = record_writer(record_file)
        try:
            for ph, g in phrase_groups:
                first_row = g.iloc[0]
                fname = first_row['filename']
                sg_db, phrase_spec = generate_spec(first_row['file_path'])
                spec_n, spec_w = sg_db.shape
                spec = sg_db.reshape(-1)
                stop_times = [t for t, _ in _phoneme_stops(g)]
                result = _boundary_targets(phrase_spec, stop_times, spec_n)
                # Track the largest first dimension, the last second
                # dimension, and the number of examples written.
                n_spec = max(n_spec, spec_n)
                n_features = spec_w
                n_records += 1
                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        'phrase': _bytes_feature([ph.encode('utf-8')]),
                        'file': _bytes_feature([fname.encode('utf-8')]),
                        'spec': _float_feature(spec),
                        'spec_n1': _int64_feature([spec_n]),
                        'spec_w1': _int64_feature([spec_w]),
                        'output': _int64_feature(result),
                    }
                ))
                writer.write(example.SerializeToString())
        finally:
            # Release the progress bar and flush the record file even when
            # a sample fails part-way through.
            phrase_groups.close()
            writer.close()

    word_groups = list(audio_samples.groupby('phrase'))
    if sample_count > 0:
        wg_sampled = reservoir_sample(word_groups, sample_count)
    else:
        wg_sampled = word_groups
    tr_audio_samples, te_audio_samples = train_test_split(
        wg_sampled, test_size=train_test_ratio)
    write_samples(tr_audio_samples, 'train')
    write_samples(te_audio_samples, 'test')
    const_file = base_dir + '/constants.pkl'
    # Context manager so the pickle file handle is closed deterministically.
    with open(const_file, 'wb') as const_fh:
        pickle.dump((n_spec, n_features, n_records), const_fh)


if __name__ == '__main__':
    # create_segments_tfrecords reads index.fixed.csv, which
    # fix_csv('story_test') produces from index.csv; run that first if the
    # fixed index does not exist yet.
    create_segments_tfrecords('story_test')