implemented segmentation visualization
parent
0b1152b5c3
commit
6ef4e86f41
|
|
@ -1,12 +1,77 @@
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import random
|
||||||
|
from functools import reduce
|
||||||
|
from speech_pitch import *
|
||||||
|
# %matplotlib inline
|
||||||
|
|
||||||
def fix_csv(collection_name = 'test'):
    """Re-save a collection's raw segmentation CSV with named columns.

    Reads ``./outputs/<collection_name>.csv``, assigning the six column
    names listed below, and writes the resulting frame (index included) to
    ``./outputs/<collection_name>.fixed.csv``.
    """
    column_names = ['phrase','filename','start_phoneme','end_phoneme','start_time','end_time']
    base_path = './outputs/'+collection_name
    raw_segments = pd.read_csv(base_path+'.csv',names=column_names)
    raw_segments.to_csv(base_path+'.fixed.csv')
|
||||||
|
|
||||||
|
def pick_random_phrases(collection_name='test', n_phrases=10):
    """Pick a random subset of the phrases found in a collection.

    Reads ``./outputs/<collection_name>.fixed.csv``, samples distinct values
    of its ``phrase`` column without replacement and writes them as a
    single-column CSV to ``./outputs/<collection_name>.random.csv``.

    Args:
        collection_name: base name of the collection files under ./outputs/.
        n_phrases: number of phrases to pick; capped at the number of
            distinct phrases available so small collections do not raise.
    """
    seg_data = pd.read_csv('./outputs/'+collection_name+'.fixed.csv',index_col=0)
    # Sorted distinct phrases: the same candidates, in the same order, that
    # grouping by 'phrase' would produce (missing phrases are left out).
    phrases = sorted(seg_data['phrase'].dropna().unique())
    chosen = random.sample(phrases, min(n_phrases, len(phrases)))
    pd.DataFrame(chosen,columns=['phrase']).to_csv('./outputs/'+collection_name+'.random.csv')
|
||||||
|
|
||||||
|
# pick_random_phrases()
|
||||||
|
|
||||||
|
def plot_random_phrases(collection_name = 'test'):
    """Plot pitch for the randomly picked phrases next to self-recorded takes.

    Loads the phrase list from ``./outputs/<collection_name>.random.csv`` and
    the segmentation rows from ``./outputs/<collection_name>.fixed.csv``. For
    every selected phrase it plots the pitch of the phrase audio (the first
    ``filename`` of the group, looked up under
    ``./outputs/<collection_name>/``) with one marker per segmentation row,
    followed by the pitch plot of a recording from ``./inputs/self/``.

    Args:
        collection_name: base name of the collection files under ./outputs/.
    """
    rand_words = pd.read_csv('./outputs/'+collection_name+'.random.csv',index_col=0)
    seg_data = pd.read_csv('./outputs/'+collection_name+'.fixed.csv',index_col=0)
    # Keep only the rows that belong to one of the picked phrases.
    selected = seg_data[seg_data['phrase'].isin(rand_words['phrase'])]
    phrase_groups = list(selected.groupby('phrase'))
    self_files = ['a_wrong_turn-low1.aiff','great_pin-low1.aiff'
                  ,'he_set_off_at_once_to_find_the_beast-low1.aiff'
                  ,'hound-low1.aiff','noises-low1.aiff','po_burped-low1.aiff'
                  ,'she_loves_the_roses-low1.aiff','the_busy_spider-low1.aiff'
                  ,'the_rain_helped-low1.aiff','to_go_to_the_doctor-low1.aiff']
    co_files = ['./inputs/self/'+x for x in self_files]
    # NOTE(review): groups and recordings are paired purely by position;
    # presumably the list above is ordered like the sorted phrases - confirm.
    for ((ph,g),s_f) in zip(phrase_groups,co_files):
        file_path = './outputs/'+collection_name+'/'+g.iloc[0]['filename']
        phrase_sample = pm_snd(file_path)
        self_sample = pm_snd(s_f)
        # Playback is currently disabled, but the output stream is still
        # opened per phrase as before; the finally guarantees it is released
        # even when plotting fails.
        _player,closer = play_sound()
        try:
            print(ph)
            # One (time, label) marker per segmentation row.
            # Assumes end_time is stored in milliseconds - TODO confirm.
            phon_stops = [(phon['end_time']/1000,phon['start_phoneme'])
                          for (_,phon) in g.iterrows()]
            plot_sample_pitch(phrase_sample,phons = phon_stops)
            plot_sample_pitch(self_sample)
        finally:
            closer()
|
||||||
|
# print(phg)#['start_phoneme'],g['start_time'])
|
||||||
|
|
||||||
|
plot_random_phrases()
|
||||||
|
|
|
||||||
|
|
@ -22,18 +22,30 @@ def accuracy(y_true, y_pred):
|
||||||
'''
|
'''
|
||||||
return K.mean(K.equal(y_true, K.cast(y_pred > 0.5, y_true.dtype)))
|
return K.mean(K.equal(y_true, K.cast(y_pred > 0.5, y_true.dtype)))
|
||||||
|
|
||||||
|
def ctc_lambda_func(args):
    """Compute the batched CTC cost from a packed argument sequence.

    ``args`` is unpacked, in order, as
    ``(y_pred, labels, input_length, label_length)``.
    """
    predictions, targets, pred_lengths, target_lengths = args
    # Skipping the first two time steps is critical: the first couple of
    # outputs of the RNN tend to be garbage.
    trimmed = predictions[:, 2:, :]
    return K.ctc_batch_cost(targets, trimmed, pred_lengths, target_lengths)
|
||||||
|
|
||||||
def segment_model(input_dim):
    """Build the convolutional + bidirectional-GRU segmentation model.

    Two Conv2D layers (the second with a single filter) and a Dropout feed a
    Reshape that drops the single-filter axis, followed by three
    bidirectional GRU layers that all return sequences and sum their two
    directions, and a final 2-way softmax Dense layer applied to the
    sequence output.

    Args:
        input_dim: shape passed to ``Input``. The caller's value is now
            honoured; it was previously overwritten with a hard-coded
            ``(100,100,1)``.

    Returns:
        A ``Model`` mapping the input tensor to the softmax output.
    """
    inp = Input(shape=input_dim)
    cnv1 = Conv2D(filters=32, kernel_size=(5,9))(inp)
    cnv2 = Conv2D(filters=1, kernel_size=(5,9))(cnv1)
    # NOTE(review): a dropout rate of 0.95 discards almost every activation -
    # confirm this is intended.
    dr_cnv2 = Dropout(rate=0.95)(cnv2)
    # cnv2 has exactly one filter, so the two remaining non-batch axes can be
    # handed to the recurrent layers as a 2-D sequence.
    cn_rnn_dim = (dr_cnv2.shape[1].value,dr_cnv2.shape[2].value)
    r_dr_cnv2 = Reshape(target_shape=cn_rnn_dim)(dr_cnv2)
    b_gr1 = Bidirectional(GRU(512, return_sequences=True),merge_mode='sum')(r_dr_cnv2)
    b_gr2 = Bidirectional(GRU(512, return_sequences=True),merge_mode='sum')(b_gr1)
    b_gr3 = Bidirectional(GRU(512, return_sequences=True),merge_mode='sum')(b_gr2)
    oup = Dense(2, activation='softmax')(b_gr3)
    return Model(inp, oup)
|
||||||
|
|
||||||
def write_model_arch(mod,mod_file):
|
def write_model_arch(mod,mod_file):
|
||||||
model_f = open(mod_file,'w')
|
model_f = open(mod_file,'w')
|
||||||
|
|
@ -58,7 +70,7 @@ def train_segment(collection_name = 'test'):
|
||||||
|
|
||||||
model = segment_model(input_dim)
|
model = segment_model(input_dim)
|
||||||
plot_model(model,show_shapes=True, to_file=model_dir+'/model.png')
|
plot_model(model,show_shapes=True, to_file=model_dir+'/model.png')
|
||||||
|
# loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred, labels, input_length, label_length])
|
||||||
tb_cb = TensorBoard(
|
tb_cb = TensorBoard(
|
||||||
log_dir=log_dir,
|
log_dir=log_dir,
|
||||||
histogram_freq=1,
|
histogram_freq=1,
|
||||||
|
|
@ -100,5 +112,4 @@ def train_segment(collection_name = 'test'):
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
import pdb; pdb.set_trace()
|
|
||||||
train_segment('test')
|
train_segment('test')
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ from pysndfile import sndio as snd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import seaborn as sns
|
import seaborn as sns
|
||||||
|
import pyaudio as pa
|
||||||
sns.set() # Use seaborn's default style to make graphs more pretty
|
sns.set() # Use seaborn's default style to make graphs more pretty
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -84,8 +85,11 @@ def plot_sample_raw(sample_file='outputs/audio/sunflowers-Victoria-180-normal-87
|
||||||
plt.ylabel("amplitude")
|
plt.ylabel("amplitude")
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
def plot_sample_intensity(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
|
def plot_file_intensity(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
    """Load ``sample_file`` with ``pm_snd`` and plot it via ``plot_sample_intensity``."""
    plot_sample_intensity(pm_snd(sample_file))
|
||||||
|
|
||||||
|
def plot_sample_intensity(snd_d):
|
||||||
intensity = snd_d.to_intensity()
|
intensity = snd_d.to_intensity()
|
||||||
spectrogram = snd_d.to_spectrogram()
|
spectrogram = snd_d.to_spectrogram()
|
||||||
plt.figure()
|
plt.figure()
|
||||||
|
|
@ -95,17 +99,41 @@ def plot_sample_intensity(sample_file='outputs/audio/sunflowers-Victoria-180-nor
|
||||||
plt.xlim([snd_d.xmin, snd_d.xmax])
|
plt.xlim([snd_d.xmin, snd_d.xmax])
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
def plot_sample_pitch(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
|
def plot_file_pitch(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
    """Load ``sample_file`` with ``pm_snd`` and plot it via ``plot_sample_pitch``."""
    plot_sample_pitch(pm_snd(sample_file))
|
||||||
|
|
||||||
|
def plot_sample_pitch(snd_d,phons = None):
    """Plot the spectrogram and pitch of a sound, with optional markers.

    Args:
        snd_d: sound object providing ``to_pitch``, ``to_spectrogram``,
            ``xmin`` and ``xmax``.
        phons: optional iterable of ``(x, label)`` pairs; each pair draws a
            vertical line at ``x`` and writes ``label`` next to it. ``None``
            (the default) draws no markers.
    """
    # None replaces the former shared mutable [] default.
    markers = () if phons is None else phons
    pitch = snd_d.to_pitch()
    spectrogram = snd_d.to_spectrogram(window_length=0.03, maximum_frequency=8000)
    plt.figure()
    draw_spectrogram(spectrogram)
    # The pitch curve and the markers are drawn on a twin axis over the
    # spectrogram.
    plt.twinx()
    draw_pitch(pitch)
    for (p,c) in markers:
        plt.axvline(x=p)
        plt.text(p,-1,c)
    plt.xlim([snd_d.xmin, snd_d.xmax])
    plt.show()
|
||||||
|
|
||||||
|
def play_sound(samplerate=22050):
    """Open a 2-channel float32 PyAudio output stream and return its controls.

    Args:
        samplerate: rate passed to the output stream.

    Returns:
        ``(sample_player, close_player)``: ``sample_player(snd_sample)``
        writes a sound to the stream, ``close_player()`` closes the stream
        and terminates PyAudio.
    """
    p_oup = pa.PyAudio()
    try:
        stream = p_oup.open(
            format=pa.paFloat32,
            channels=2,
            rate=samplerate,
            output=True)
    except Exception:
        # Do not leak the PyAudio instance when the stream cannot be opened.
        p_oup.terminate()
        raise

    def sample_player(snd_sample=None):
        """Write ``snd_sample`` to the open stream, duplicated on both channels."""
        if snd_sample is None:
            raise ValueError('sample_player needs a sound sample to play')
        # NOTE(review): assumes as_array() is laid out (samples, channels) so
        # that [:,0] selects the first channel - confirm against pm_snd.
        samples = snd_sample.as_array()[:,0]
        # Interleave the same signal for both output channels: s0,s0,s1,s1,...
        one_channel = np.asarray([samples, samples]).T.reshape(-1)
        audio_data = one_channel.astype(np.float32).tobytes()
        stream.write(audio_data)

    def close_player():
        """Close the stream, then terminate PyAudio even if closing fails."""
        try:
            stream.close()
        finally:
            p_oup.terminate()

    return sample_player,close_player
|
||||||
# snd_part = snd_d.extract_part(from_time=0.9, preserve_times=True)
|
# snd_part = snd_d.extract_part(from_time=0.9, preserve_times=True)
|
||||||
# plt.figure()
|
# plt.figure()
|
||||||
# plt.plot(snd_part.xs(), snd_part.values, linewidth=0.5)
|
# plt.plot(snd_part.xs(), snd_part.values, linewidth=0.5)
|
||||||
|
|
@ -116,8 +144,16 @@ def plot_sample_pitch(sample_file='outputs/audio/sunflowers-Victoria-180-normal-
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Clips that are plotted and then played back, in this order.
    played_files = [
        'outputs/test/a_warm_smile_and_a_good_heart-1917.aiff',
        'outputs/test/a_wrong_turn-3763.aiff',
        'inputs/self/a_wrong_turn-low1.aiff',
        'inputs/self/a_wrong_turn-low2.aiff',
    ]
    # Clips that are only plotted.
    plotted_files = [
        'inputs/self/apple-low1.aiff',
        'inputs/self/apple-low2.aiff',
        'inputs/self/apple-medium1.aiff',
    ]
    plot_file_pitch('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
    # play_sound() takes a sample rate and returns (player, closer); the
    # sound itself has to be handed to the returned player.
    player, closer = play_sound()
    try:
        for sample_file in played_files:
            plot_file_pitch(sample_file)
            player(pm_snd(sample_file))
    finally:
        closer()
    for sample_file in plotted_files:
        plot_file_pitch(sample_file)
|
||||||
|
|
|
||||||
|
|
@ -20,7 +20,7 @@ apple_phonemes = [
|
||||||
'l', 'm', 'n', 'N', 'p', 'r', 's', 'S', 't', 'T', 'v', 'w', 'y', 'z', 'Z'
|
'l', 'm', 'n', 'N', 'p', 'r', 's', 'S', 't', 'T', 'v', 'w', 'y', 'z', 'Z'
|
||||||
]
|
]
|
||||||
|
|
||||||
OUTPUT_NAME = 'test'
|
OUTPUT_NAME = 'story_phrases_segments'
|
||||||
|
|
||||||
dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
|
dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
|
||||||
csv_dest_file = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '.csv'
|
csv_dest_file = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '.csv'
|
||||||
|
|
@ -184,7 +184,7 @@ def story_texts():
|
||||||
|
|
||||||
def generate_audio():
|
def generate_audio():
|
||||||
synthQ = SynthesizerQueue()
|
synthQ = SynthesizerQueue()
|
||||||
phrases = random.sample(story_texts(), 100) # story_texts()
|
phrases = story_texts()#random.sample(story_texts(), 100) #
|
||||||
f = open(csv_dest_file, 'w')
|
f = open(csv_dest_file, 'w')
|
||||||
s_csv_w = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
|
s_csv_w = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
|
||||||
i = 0
|
i = 0
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue