diff --git a/segment_data.py b/segment_data.py
index 1e94655..72daa74 100644
--- a/segment_data.py
+++ b/segment_data.py
@@ -1,12 +1,77 @@
 import pandas as pd
+import numpy as np
+import random
+from functools import reduce
+from speech_pitch import *
+# %matplotlib inline
 
 def fix_csv(collection_name = 'test'):
     seg_data = pd.read_csv('./outputs/'+collection_name+'.csv',names=['phrase','filename'
                            ,'start_phoneme','end_phoneme','start_time','end_time'])
     seg_data.to_csv('./outputs/'+collection_name+'.fixed.csv')
-
-def segment_data_gen(collection_name = 'test'):
+def pick_random_phrases(collection_name='test'):
     collection_name = 'test'
     seg_data = pd.read_csv('./outputs/'+collection_name+'.fixed.csv',index_col=0)
-
+    phrase_groups = random.sample([i for i in seg_data.groupby(['phrase'])],10)
+    result = []
+    for ph,g in phrase_groups:
+        result.append(ph)
+    pd.DataFrame(result,columns=['phrase']).to_csv('./outputs/'+collection_name+'.random.csv')
+
+# pick_random_phrases()
+
+def plot_random_phrases(collection_name = 'test'):
+    collection_name = 'test'
+    rand_words = pd.read_csv('./outputs/'+collection_name+'.random.csv',index_col=0)
+    rand_w_list = rand_words['phrase'].tolist()
+    seg_data = pd.read_csv('./outputs/'+collection_name+'.fixed.csv',index_col=0)
+    result = (seg_data['phrase'] == rand_w_list[0])
+    for i in rand_w_list[1:]:
+        result |= (seg_data['phrase'] == i)
+    # seg_data[result]
+    phrase_groups = [i for i in seg_data[result].groupby(['phrase'])]
+    self_files = ['a_wrong_turn-low1.aiff','great_pin-low1.aiff'
+                  ,'he_set_off_at_once_to_find_the_beast-low1.aiff'
+                  ,'hound-low1.aiff','noises-low1.aiff','po_burped-low1.aiff'
+                  ,'she_loves_the_roses-low1.aiff','the_busy_spider-low1.aiff'
+                  ,'the_rain_helped-low1.aiff','to_go_to_the_doctor-low1.aiff']
+    co_files = map(lambda x: './inputs/self/'+x,self_files)
+
+    for ((ph,g),s_f) in zip(phrase_groups,co_files):
+        # ph,g = phrase_groups[0]
+        file_path = './outputs/test/'+g.iloc[0]['filename']
+        phrase_sample = pm_snd(file_path)
+        self_sample = pm_snd(s_f)
+        player,closer = play_sound()
+        # rows = [i for i in g.iterrows()]
+        # random.shuffle(rows)
+        print(ph)
+        phon_stops = []
+        for (i,phon) in g.iterrows():
+            end_t = phon['end_time']/1000
+            phon_ch = phon['start_phoneme']
+            phon_stops.append((end_t,phon_ch))
+        plot_sample_pitch(phrase_sample,phons = phon_stops)
+        plot_sample_pitch(self_sample)
+        # player(phrase_sample)
+        # input()
+        # for (i,phon) in g.iterrows():
+        #     # phon = g.iloc[1]
+        #     start_t = phon['start_time']/1000
+        #     end_t = phon['end_time']/1000
+        #     phon_ch = phon['start_phoneme']
+        #     phon_sample = phrase_sample.extract_part(from_time=start_t,to_time=end_t)
+        #     if phon_sample.n_samples*phon_sample.sampling_period < 6.4/100:
+        #         continue
+        #     # if phon_ch[0] not in 'AEIOU':
+        #     #     continue
+        #     # phon_sample
+        #     # player(phon_sample)
+        #     # plot_sample_intensity(phon_sample)
+        #     print(phon_ch)
+        #     plot_sample_pitch(phon_sample)
+        closer()
+    # print(phg)#['start_phoneme'],g['start_time'])
+
+plot_random_phrases()
diff --git a/segment_model.py b/segment_model.py
index acb4eec..d69be6d 100644
--- a/segment_model.py
+++ b/segment_model.py
@@ -22,18 +22,30 @@ def accuracy(y_true, y_pred):
     '''
     return K.mean(K.equal(y_true, K.cast(y_pred > 0.5, y_true.dtype)))
 
+def ctc_lambda_func(args):
+    y_pred, labels, input_length, label_length = args
+    # the 2 is critical here since the first couple outputs of the RNN
+    # tend to be garbage:
+    y_pred = y_pred[:, 2:, :]
+    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
+
 def segment_model(input_dim):
+    input_dim = (100,100,1)
     inp = Input(shape=input_dim)
-    # ls0 = LSTM(512, return_sequences=True)(inp)
-    cnv1 = Conv2D(filters=512, kernel_size=(5,9))(inp)
+    cnv1 = Conv2D(filters=32, kernel_size=(5,9))(inp)
     cnv2 = Conv2D(filters=1, kernel_size=(5,9))(cnv1)
     dr_cnv2 = Dropout(rate=0.95)(cnv2)
+    # dr_cnv2
     cn_rnn_dim = (dr_cnv2.shape[1].value,dr_cnv2.shape[2].value)
     r_dr_cnv2 = Reshape(target_shape=cn_rnn_dim)(dr_cnv2)
-    b_gr1 = Bidirectional(GRU(512, return_sequences=True))(r_dr_cnv2)
-    b_gr2 = Bidirectional(GRU(512, return_sequences=True))(b_gr1)
-    b_gr3 = Bidirectional(GRU(512))(b_gr2)
-    return Model(inp, b_gr3)
+    b_gr1 = Bidirectional(GRU(512, return_sequences=True),merge_mode='sum')(r_dr_cnv2)
+    # b_gr1
+    b_gr2 = Bidirectional(GRU(512, return_sequences=True),merge_mode='sum')(b_gr1)
+    b_gr3 = Bidirectional(GRU(512, return_sequences=True),merge_mode='sum')(b_gr2)
+    # b_gr3
+    oup = Dense(2, activation='softmax')(b_gr3)
+    # oup
+    return Model(inp, oup)
 
 def write_model_arch(mod,mod_file):
     model_f = open(mod_file,'w')
@@ -58,7 +70,7 @@ def train_segment(collection_name = 'test'):
 
     model = segment_model(input_dim)
     plot_model(model,show_shapes=True, to_file=model_dir+'/model.png')
-
+    # loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred, labels, input_length, label_length])
     tb_cb = TensorBoard(
         log_dir=log_dir,
         histogram_freq=1,
@@ -100,5 +112,4 @@ def train_segment(collection_name = 'test'):
 
 
 if __name__ == '__main__':
-    import pdb; pdb.set_trace()
     train_segment('test')
diff --git a/speech_pitch.py b/speech_pitch.py
index 823c17b..17a8ff8 100644
--- a/speech_pitch.py
+++ b/speech_pitch.py
@@ -3,6 +3,7 @@ from pysndfile import sndio as snd
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
+import pyaudio as pa
 
 sns.set() # Use seaborn's default style to make graphs more pretty
 
@@ -84,8 +85,11 @@ def plot_sample_raw(sample_file='outputs/audio/sunflowers-Victoria-180-normal-87
     plt.ylabel("amplitude")
     plt.show()
 
-def plot_sample_intensity(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
+def plot_file_intensity(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
     snd_d = pm_snd(sample_file)
+    plot_sample_intensity(snd_d)
+
+def plot_sample_intensity(snd_d):
     intensity = snd_d.to_intensity()
     spectrogram = snd_d.to_spectrogram()
     plt.figure()
@@ -95,17 +99,41 @@ def plot_sample_intensity(sample_file='outputs/audio/sunflowers-Victoria-180-nor
     plt.xlim([snd_d.xmin, snd_d.xmax])
     plt.show()
 
-def plot_sample_pitch(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
+def plot_file_pitch(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
     snd_d = pm_snd(sample_file)
+    plot_sample_pitch(snd_d)
+
+def plot_sample_pitch(snd_d,phons = []):
     pitch = snd_d.to_pitch()
     spectrogram = snd_d.to_spectrogram(window_length=0.03, maximum_frequency=8000)
     plt.figure()
     draw_spectrogram(spectrogram)
     plt.twinx()
     draw_pitch(pitch)
+    for (p,c) in phons:
+        plt.axvline(x=p)
+        plt.text(p,-1,c)
     plt.xlim([snd_d.xmin, snd_d.xmax])
     plt.show()
 
+def play_sound(samplerate=22050):
+    #snd_sample = pm_snd('outputs/test/a_warm_smile_and_a_good_heart-1917.aiff')
+    p_oup = pa.PyAudio()
+    stream = p_oup.open(
+        format=pa.paFloat32,
+        channels=2,
+        rate=samplerate,
+        output=True)
+    def sample_player(snd_sample=None):
+        samples = snd_sample.as_array()[:,0]
+
+        one_channel = np.asarray([samples, samples]).T.reshape(-1)
+        audio_data = one_channel.astype(np.float32).tobytes()
+        stream.write(audio_data)
+    def close_player():
+        stream.close()
+        p_oup.terminate()
+    return sample_player,close_player
 # snd_part = snd_d.extract_part(from_time=0.9, preserve_times=True)
 # plt.figure()
 # plt.plot(snd_part.xs(), snd_part.values, linewidth=0.5)
@@ -116,8 +144,16 @@ def plot_sample_pitch(sample_file='outputs/audio/sunflowers-Victoria-180-normal-
 
 
 if __name__ == '__main__':
-    mom_snd = pm_snd('outputs/test/moms_are_engineers-7608.aiff')
-    plot_sample_pitch('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
-    plot_sample_pitch('inputs/self-apple/apple-low1.aiff')
-    plot_sample_pitch('inputs/self-apple/apple-low2.aiff')
-    plot_sample_pitch('inputs/self-apple/apple-medium1.aiff')
+    # mom_snd = pm_snd('outputs/test/moms_are_engineers-7608.aiff')
+    plot_file_pitch('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
+    plot_file_pitch('outputs/test/a_warm_smile_and_a_good_heart-1917.aiff')
+    play_sound(pm_snd('outputs/test/a_warm_smile_and_a_good_heart-1917.aiff'))
+    plot_file_pitch('outputs/test/a_wrong_turn-3763.aiff')
+    play_sound(pm_snd('outputs/test/a_wrong_turn-3763.aiff'))
+    plot_file_pitch('inputs/self/a_wrong_turn-low1.aiff')
+    play_sound(pm_snd('inputs/self/a_wrong_turn-low1.aiff'))
+    plot_file_pitch('inputs/self/a_wrong_turn-low2.aiff')
+    play_sound(pm_snd('inputs/self/a_wrong_turn-low2.aiff'))
+    plot_file_pitch('inputs/self/apple-low1.aiff')
+    plot_file_pitch('inputs/self/apple-low2.aiff')
+    plot_file_pitch('inputs/self/apple-medium1.aiff')
diff --git a/speech_segmentgen.py b/speech_segmentgen.py
index ceb5dcf..a09b949 100644
--- a/speech_segmentgen.py
+++ b/speech_segmentgen.py
@@ -20,7 +20,7 @@ apple_phonemes = [
     'l', 'm', 'n', 'N', 'p', 'r', 's', 'S', 't', 'T', 'v', 'w', 'y', 'z', 'Z'
 ]
 
-OUTPUT_NAME = 'test'
+OUTPUT_NAME = 'story_phrases_segments'
 
 dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
 csv_dest_file = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '.csv'
@@ -184,7 +184,7 @@ def story_texts():
 
 def generate_audio():
     synthQ = SynthesizerQueue()
-    phrases = random.sample(story_texts(), 100) # story_texts()
+    phrases = story_texts()#random.sample(story_texts(), 100) #
     f = open(csv_dest_file, 'w')
     s_csv_w = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
     i = 0
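
Note on the CTC pieces (illustration only, not part of the diff): segment_model.py now defines ctc_lambda_func, but the Lambda 'ctc' line in train_segment() is still commented out. A minimal sketch of how the two could be wired together is below; the extra Input tensors, the max_label_len constant, and the optimizer are assumptions, not part of this changeset.

from keras.layers import Input, Lambda
from keras.models import Model
from segment_model import segment_model, ctc_lambda_func

max_label_len = 16  # assumed upper bound on the label sequence length

base = segment_model((100, 100, 1))   # CNN + BiGRU stack from segment_model.py
y_pred = base.output                  # per-timestep softmax over 2 classes

labels = Input(name='the_labels', shape=[max_label_len], dtype='float32')
# input_length values must not exceed the RNN output length minus the 2 frames
# that ctc_lambda_func drops from the start of y_pred.
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')

# The Lambda layer computes the CTC loss itself (via K.ctc_batch_cost inside
# ctc_lambda_func), so the compiled loss below just passes that value through.
loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')(
    [y_pred, labels, input_length, label_length])

ctc_model = Model(inputs=[base.input, labels, input_length, label_length],
                  outputs=loss_out)
ctc_model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer='adam')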
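
Likewise, a short usage sketch of the refactored plotting and playback helpers in speech_pitch.py; the phoneme boundary times and labels passed to phons are invented for illustration, and the file path is one of the inputs referenced in the diff.

from speech_pitch import pm_snd, play_sound, plot_sample_pitch, plot_file_pitch

plot_file_pitch('inputs/self/a_wrong_turn-low1.aiff')      # load a file and plot it in one call

snd = pm_snd('inputs/self/a_wrong_turn-low1.aiff')         # or load once and reuse the Sound object
player, closer = play_sound()                              # one PyAudio output stream for several clips
plot_sample_pitch(snd, phons=[(0.12, 'r'), (0.30, 'OW')])  # mark assumed phoneme boundaries on the plot
player(snd)
closer()                                                   # close the stream and terminate PyAudio

play_sound() opens the stream at 22050 Hz by default, so a clip synthesized at a different rate should be played with samplerate passed explicitly.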