implemented segmentation visualization

master
Malar Kannan 2017-11-30 14:49:55 +05:30
parent 0b1152b5c3
commit 6ef4e86f41
4 changed files with 132 additions and 20 deletions

View File

@ -1,12 +1,77 @@
import pandas as pd
import numpy as np
import random
from functools import reduce
from speech_pitch import *
# %matplotlib inline
def fix_csv(collection_name = 'test'):
    """Re-save a collection's segment CSV with explicit column names.

    Reads ``./outputs/<collection_name>.csv`` using the six column names
    below as the header, then writes the frame (including its default
    integer index) to ``./outputs/<collection_name>.fixed.csv``.
    """
    column_names = ['phrase', 'filename', 'start_phoneme', 'end_phoneme',
                    'start_time', 'end_time']
    base_path = './outputs/' + collection_name
    segments = pd.read_csv(base_path + '.csv', names=column_names)
    segments.to_csv(base_path + '.fixed.csv')
def pick_random_phrases(collection_name='test'):
    """Pick ten random phrases from a collection and save them to CSV.

    Reads ``./outputs/<collection_name>.fixed.csv``, samples ten distinct
    values of its ``phrase`` column without replacement and writes them as a
    single ``phrase`` column to ``./outputs/<collection_name>.random.csv``.

    Args:
        collection_name: Base name of the collection files under ./outputs/.

    Raises:
        ValueError: Raised by ``random.sample`` when the collection holds
            fewer than ten distinct phrases.
    """
    # The parameter is honoured here; it used to be overwritten with 'test'.
    base_path = './outputs/' + collection_name
    seg_data = pd.read_csv(base_path + '.fixed.csv', index_col=0)
    # Only the group keys are needed, so sample those instead of whole groups.
    phrases = [phrase for phrase, _ in seg_data.groupby('phrase')]
    chosen = random.sample(phrases, 10)
    pd.DataFrame(chosen, columns=['phrase']).to_csv(base_path + '.random.csv')
# pick_random_phrases()
def plot_random_phrases(collection_name = 'test'):
    """Plot each randomly picked phrase next to a self-recorded reference.

    Reads the phrase list from ``./outputs/<collection_name>.random.csv`` and
    the segment table from ``./outputs/<collection_name>.fixed.csv``. For
    every picked phrase the first listed audio file of that phrase is loaded
    from ``./outputs/<collection_name>/`` and its pitch plot is drawn with a
    marker at the end time of every segment row, followed by the pitch plot
    of the paired recording from ``./inputs/self/``.

    Args:
        collection_name: Base name of the collection files under ./outputs/.
    """
    # The parameter is honoured here; it used to be overwritten with 'test'.
    base_path = './outputs/' + collection_name
    rand_words = pd.read_csv(base_path + '.random.csv', index_col=0)
    seg_data = pd.read_csv(base_path + '.fixed.csv', index_col=0)

    # isin() also copes with an empty phrase list, unlike an OR-chain seeded
    # from the first element.
    picked = seg_data[seg_data['phrase'].isin(rand_words['phrase'].tolist())]
    # Grouping by the bare column name keeps each group key a plain string.
    phrase_groups = list(picked.groupby('phrase'))

    # NOTE(review): groups are paired with these recordings purely by
    # position (groupby yields phrases in sorted order and this list is
    # sorted too) -- this presumably assumes the .random.csv file holds
    # exactly these ten phrases; confirm.
    self_files = ['a_wrong_turn-low1.aiff', 'great_pin-low1.aiff',
                  'he_set_off_at_once_to_find_the_beast-low1.aiff',
                  'hound-low1.aiff', 'noises-low1.aiff', 'po_burped-low1.aiff',
                  'she_loves_the_roses-low1.aiff', 'the_busy_spider-low1.aiff',
                  'the_rain_helped-low1.aiff', 'to_go_to_the_doctor-low1.aiff']
    co_files = ['./inputs/self/' + name for name in self_files]

    # No audio stream is opened here: nothing in this function plays sound.
    for (ph, g), s_f in zip(phrase_groups, co_files):
        file_path = base_path + '/' + g.iloc[0]['filename']
        phrase_sample = pm_snd(file_path)
        self_sample = pm_snd(s_f)
        print(ph)
        # end_time is divided by 1000 -- presumably milliseconds to seconds
        # to match the plot's time axis; confirm against the CSV producer.
        phon_stops = [(phon['end_time'] / 1000, phon['start_phoneme'])
                      for _, phon in g.iterrows()]
        plot_sample_pitch(phrase_sample, phons=phon_stops)
        plot_sample_pitch(self_sample)
# Runs whenever this module is executed or imported: draws the plots for the
# default 'test' collection.
plot_random_phrases()

View File

@ -22,18 +22,30 @@ def accuracy(y_true, y_pred):
'''
return K.mean(K.equal(y_true, K.cast(y_pred > 0.5, y_true.dtype)))
def ctc_lambda_func(args):
    """Compute the batch CTC cost from a packed argument list.

    ``args`` unpacks to ``(y_pred, labels, input_length, label_length)``.
    The first two steps along axis 1 of ``y_pred`` are discarded before the
    cost is computed, because the first couple of RNN outputs tend to be
    garbage.

    NOTE(review): ``input_length`` is passed through unchanged -- presumably
    the caller already accounts for the two dropped steps; confirm.
    """
    predictions, targets, pred_lengths, target_lengths = args
    # Dropping exactly two leading steps is critical here.
    trimmed_predictions = predictions[:, 2:, :]
    return K.ctc_batch_cost(targets, trimmed_predictions,
                            pred_lengths, target_lengths)
def segment_model(input_dim):
    """Build the Conv2D + bidirectional-GRU segmentation network.

    Layer stack: two ``Conv2D`` layers with (5, 9) kernels (32 filters, then
    1 filter), dropout, a reshape that removes the single-channel axis, three
    bidirectional ``GRU(512)`` layers whose two directions are summed and
    which all return full sequences, and a 2-unit softmax ``Dense`` layer
    applied to the sequence output.

    Args:
        input_dim: Shape passed to ``Input``. The reshape below keeps only
            axes 1 and 2 of the convolution output, so a 3-axis
            (dim1, dim2, channels) shape is expected.

    Returns:
        A ``Model`` mapping the input tensor to the softmax output.
    """
    # The caller's input_dim is honoured; it used to be overwritten with a
    # hard-coded (100, 100, 1).
    inp = Input(shape=input_dim)
    cnv1 = Conv2D(filters=32, kernel_size=(5, 9))(inp)
    cnv2 = Conv2D(filters=1, kernel_size=(5, 9))(cnv1)
    # NOTE(review): rate=0.95 is unusually high for a dropout layer --
    # confirm it is intended.
    dr_cnv2 = Dropout(rate=0.95)(cnv2)
    # cnv2 has a single filter, so the trailing axis can be folded away to
    # give the 2-D per-sample input the recurrent layers take.
    cn_rnn_dim = (dr_cnv2.shape[1].value, dr_cnv2.shape[2].value)
    r_dr_cnv2 = Reshape(target_shape=cn_rnn_dim)(dr_cnv2)
    b_gr1 = Bidirectional(GRU(512, return_sequences=True),
                          merge_mode='sum')(r_dr_cnv2)
    b_gr2 = Bidirectional(GRU(512, return_sequences=True),
                          merge_mode='sum')(b_gr1)
    b_gr3 = Bidirectional(GRU(512, return_sequences=True),
                          merge_mode='sum')(b_gr2)
    oup = Dense(2, activation='softmax')(b_gr3)
    return Model(inp, oup)
def write_model_arch(mod,mod_file):
model_f = open(mod_file,'w')
@ -58,7 +70,7 @@ def train_segment(collection_name = 'test'):
model = segment_model(input_dim)
plot_model(model,show_shapes=True, to_file=model_dir+'/model.png')
# loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred, labels, input_length, label_length])
tb_cb = TensorBoard(
log_dir=log_dir,
histogram_freq=1,
@ -100,5 +112,4 @@ def train_segment(collection_name = 'test'):
if __name__ == '__main__':
    # Script entry point: train on the 'test' collection. No debugger
    # breakpoint is set here, so the script runs unattended.
    train_segment('test')

View File

@ -3,6 +3,7 @@ from pysndfile import sndio as snd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pyaudio as pa
sns.set() # Use seaborn's default style to make graphs more pretty
@ -84,8 +85,11 @@ def plot_sample_raw(sample_file='outputs/audio/sunflowers-Victoria-180-normal-87
plt.ylabel("amplitude")
plt.show()
def plot_file_intensity(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
    """Load the audio file at ``sample_file`` and plot its intensity.

    Thin file-path wrapper: the file is loaded with ``pm_snd`` and the
    resulting sound object is handed to ``plot_sample_intensity``.
    """
    snd_d = pm_snd(sample_file)
    plot_sample_intensity(snd_d)
def plot_sample_intensity(snd_d):
intensity = snd_d.to_intensity()
spectrogram = snd_d.to_spectrogram()
plt.figure()
@ -95,17 +99,41 @@ def plot_sample_intensity(sample_file='outputs/audio/sunflowers-Victoria-180-nor
plt.xlim([snd_d.xmin, snd_d.xmax])
plt.show()
def plot_file_pitch(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
    """Load the audio file at ``sample_file`` and plot its pitch.

    Thin file-path wrapper: the file is loaded with ``pm_snd`` and the
    resulting sound object is handed to ``plot_sample_pitch``.
    """
    snd_d = pm_snd(sample_file)
    plot_sample_pitch(snd_d)
def plot_sample_pitch(snd_d, phons=None):
    """Draw the spectrogram of a sound with its pitch on a twin y-axis.

    Args:
        snd_d: Sound object providing ``to_pitch()``, ``to_spectrogram()``,
            ``xmin`` and ``xmax``.
        phons: Optional iterable of ``(x, label)`` pairs. For each pair a
            vertical line is drawn at ``x`` and ``label`` is written at
            ``(x, -1)``. ``None`` (the default) draws no markers.
    """
    # None replaces the former mutable `[]` default; behaviour is the same.
    markers = () if phons is None else phons
    pitch = snd_d.to_pitch()
    spectrogram = snd_d.to_spectrogram(window_length=0.03, maximum_frequency=8000)
    plt.figure()
    draw_spectrogram(spectrogram)
    # The pitch is drawn on a second y-axis sharing the spectrogram's x-axis.
    plt.twinx()
    draw_pitch(pitch)
    for p, c in markers:
        plt.axvline(x=p)
        plt.text(p, -1, c)
    plt.xlim([snd_d.xmin, snd_d.xmax])
    plt.show()
def play_sound(samplerate=22050):
    """Open a two-channel float32 PyAudio output stream and wrap it.

    Args:
        samplerate: Rate the output stream is opened with.

    Returns:
        A ``(sample_player, close_player)`` pair. ``sample_player(snd_sample)``
        writes a sound to the stream; ``close_player()`` closes the stream
        and terminates the PyAudio instance.
    """
    audio = pa.PyAudio()
    out_stream = audio.open(
        format=pa.paFloat32,
        channels=2,
        rate=samplerate,
        output=True)

    def sample_player(snd_sample=None):
        # Only column 0 of the sound's array is used.
        # NOTE(review): presumably that column is the first channel --
        # confirm the axis order of as_array().
        mono = snd_sample.as_array()[:, 0]
        # Repeat every value twice so both interleaved channels of the
        # stream receive the same signal.
        interleaved = np.repeat(mono, 2)
        out_stream.write(interleaved.astype(np.float32).tobytes())

    def close_player():
        out_stream.close()
        audio.terminate()

    return sample_player, close_player
# snd_part = snd_d.extract_part(from_time=0.9, preserve_times=True)
# plt.figure()
# plt.plot(snd_part.xs(), snd_part.values, linewidth=0.5)
@ -116,8 +144,16 @@ def plot_sample_pitch(sample_file='outputs/audio/sunflowers-Victoria-180-normal-
if __name__ == '__main__':
    # play_sound() takes a sample rate and returns a (player, closer) pair,
    # so each sound is passed to the returned player, not to play_sound().
    player, closer = play_sound()
    try:
        plot_file_pitch('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
        # These recordings are plotted and then played back.
        for sample_file in (
                'outputs/test/a_warm_smile_and_a_good_heart-1917.aiff',
                'outputs/test/a_wrong_turn-3763.aiff',
                'inputs/self/a_wrong_turn-low1.aiff',
                'inputs/self/a_wrong_turn-low2.aiff'):
            plot_file_pitch(sample_file)
            player(pm_snd(sample_file))
        # These recordings are only plotted.
        for sample_file in (
                'inputs/self/apple-low1.aiff',
                'inputs/self/apple-low2.aiff',
                'inputs/self/apple-medium1.aiff'):
            plot_file_pitch(sample_file)
    finally:
        # Release the audio stream even when plotting or playback fails.
        closer()

View File

@ -20,7 +20,7 @@ apple_phonemes = [
'l', 'm', 'n', 'N', 'p', 'r', 's', 'S', 't', 'T', 'v', 'w', 'y', 'z', 'Z'
]
# Name of the generated collection; it selects both the audio output
# directory and the CSV file written next to it.
OUTPUT_NAME = 'story_phrases_segments'
# dest_dir keeps its trailing slash.
dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
csv_dest_file = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '.csv'
@ -184,7 +184,7 @@ def story_texts():
def generate_audio():
synthQ = SynthesizerQueue()
phrases = random.sample(story_texts(), 100) # story_texts()
phrases = story_texts()#random.sample(story_texts(), 100) #
f = open(csv_dest_file, 'w')
s_csv_w = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
i = 0