Merge branch 'master' of /Users/malarkannan/Public/repos/speech-scoring

master
Malar Kannan 2017-11-28 12:32:50 +05:30
commit 8f79316893
5 changed files with 142 additions and 40 deletions

View File

@@ -8,6 +8,7 @@ distributed==1.19.3
 entrypoints==0.2.3
 enum34==1.1.6
 futures==3.1.1
+graphviz==0.8.1
 h5py==2.7.1
 HeapDict==1.0.0
 html5lib==0.9999999
@@ -41,12 +42,14 @@ partd==0.3.8
 pexpect==4.2.1
 pickleshare==0.7.4
 pkg-resources==0.0.0
+praat-parselmouth==0.2.0
 progressbar2==3.34.3
 prompt-toolkit==1.0.15
-protobuf==3.4.0
+protobuf==3.5.0
 psutil==5.4.0
 ptyprocess==0.5.2
 PyAudio==0.2.11
+pydot==1.2.3
 Pygments==2.2.0
 pyparsing==2.2.0
 pysndfile==1.0.0
@@ -65,7 +68,7 @@ sortedcontainers==1.5.7
 tables==3.4.2
 tblib==1.3.2
 tensorflow==1.3.0
-tensorflow-tensorboard==0.4.0rc1
+tensorflow-tensorboard==0.4.0rc3
 terminado==0.6
 testpath==0.3.1
 toolz==0.8.2
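Note: the three additions line up with changes elsewhere in this commit. praat-parselmouth is imported by the analysis script below, while pydot and graphviz are what keras.utils.plot_model needs to render the model diagrams added to the training script. The pip graphviz package only wraps the system Graphviz binaries, which pip does not install; a quick sanity check that rendering actually works (a sketch, output filename arbitrary):

import pydot

# pydot shells out to the 'dot' executable from system Graphviz;
# write_png raises an exception if the binaries are not on PATH.
graph = pydot.Dot(graph_type='digraph')
graph.add_node(pydot.Node('check'))
graph.write_png('graphviz_check.png')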

View File

@@ -20,7 +20,7 @@ from tqdm import tqdm
 def siamese_pairs(rightGroup, wrongGroup):
     group1 = [r for (i, r) in rightGroup.iterrows()]
     group2 = [r for (i, r) in wrongGroup.iterrows()]
-    rightWrongPairs = [(g1, g2) for g2 in group2 for g1 in group1]+[(g2, g1) for g2 in group2 for g1 in group1]
+    rightWrongPairs = [(g1, g2) for g2 in group2 for g1 in group1]#+[(g2, g1) for g2 in group2 for g1 in group1]
     rightRightPairs = [i for i in itertools.permutations(group1, 2)]#+[i for i in itertools.combinations(group2, 2)]
     def filter_criteria(s1,s2):
         same = s1['variant'] == s2['variant']
@@ -28,8 +28,8 @@ def siamese_pairs(rightGroup, wrongGroup):
         voice_diff = s1['voice'] != s2['voice']
         if not same and phon_same:
             return False
-        if same and not voice_diff:
-            return False
+        # if same and not voice_diff:
+        #     return False
         return True
     validRWPairs = [i for i in rightWrongPairs if filter_criteria(*i)]
     validRRPairs = [i for i in rightRightPairs if filter_criteria(*i)]
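Note: siamese_pairs builds the negative pairs as the cross product right x wrong and the positives as ordered permutations within the right group. This commit halves the negatives by dropping the mirrored (wrong, right) orientation, and relaxes filter_criteria so same-variant pairs from the same voice are no longer rejected; only cross-variant pairs that the phon_same check (defined just above this hunk) flags still get filtered. A self-contained sketch of the pairing logic with toy rows (field values hypothetical):

import itertools

# Toy records standing in for the DataFrame rows.
right = [{'variant': 'low', 'voice': 'Alex'},
         {'variant': 'low', 'voice': 'Fred'}]
wrong = [{'variant': 'medium', 'voice': 'Alex'}]

# One orientation only, as in the new code: (right, wrong) but not (wrong, right).
neg = [(r, w) for w in wrong for r in right]
# Ordered permutations within the right group give the positive pairs.
pos = list(itertools.permutations(right, 2))
print(len(neg), len(pos))  # -> 2 2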
@@ -64,8 +64,8 @@ def create_spectrogram_tfrecords(audio_group='audio',sample_count=0,train_test_r
     for (w, word_group) in word_group_prog:
         word_group_prog.set_postfix(word=w,sample_name=sample_name)
         g = word_group.reset_index()
-        # g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram)
-        g['spectrogram'] = apply_by_multiprocessing(g['file_path'],compute_mfcc)
+        g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram)
+        # g['spectrogram'] = apply_by_multiprocessing(g['file_path'],compute_mfcc)
         sample_right = g.loc[g['variant'] == 'low']
         sample_wrong = g.loc[g['variant'] == 'medium']
         same, diff = siamese_pairs(sample_right, sample_wrong)
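Note: this flips the features back from MFCCs to full spectrograms, computed in parallel over the file_path column. apply_by_multiprocessing itself is defined outside this diff; a common shape for such a helper is the sketch below (name and chunking strategy assumed, and func must be a picklable top-level function):

import multiprocessing

import numpy as np
import pandas as pd

def _apply_chunk(args):
    # Unpack (chunk, func) and run a plain pandas apply inside the worker.
    chunk, func = args
    return chunk.apply(func)

def apply_by_multiprocessing(series, func, workers=4):
    # Split the Series into roughly equal chunks, map them across a pool,
    # and stitch the results back together in the original order.
    chunks = np.array_split(series, workers)
    pool = multiprocessing.Pool(workers)
    try:
        results = pool.map(_apply_chunk, [(c, func) for c in chunks])
    finally:
        pool.close()
        pool.join()
    return pd.concat(results)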
@@ -208,11 +208,17 @@ def audio_samples_word_count(audio_group='audio'):
 def record_generator_count(records_file):
     record_iterator = tf.python_io.tf_record_iterator(path=records_file)
-    count = 0
+    count,spec_n = 0,0
     for i in record_iterator:
+        example = tf.train.Example()
+        example.ParseFromString(i)
+        spec_n1 = example.features.feature['spec_n1'].int64_list.value[0]
+        spec_n2 = example.features.feature['spec_n2'].int64_list.value[0]
+        spec_n = max([spec_n,spec_n1,spec_n2])
+        import pdb; pdb.set_trace()
         count+=1
     record_iterator = tf.python_io.tf_record_iterator(path=records_file)
-    return record_iterator,count
+    return record_iterator,count,spec_n

 def fix_csv(audio_group='audio'):
     audio_csv_lines = open('./outputs/' + audio_group + '.csv','r').readlines()
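Note: the counter now also parses each serialized example to recover the stored spectrogram frame counts, tracking the running maximum so callers can pad to a fixed width. The import pdb; pdb.set_trace() left inside the loop drops into the debugger on every single record, so this reads as in-progress debugging rather than a finished change. Without the breakpoint the function reduces to this sketch (same feature names as in the diff):

import tensorflow as tf

def record_generator_count(records_file):
    count, spec_n = 0, 0
    for raw in tf.python_io.tf_record_iterator(path=records_file):
        example = tf.train.Example()
        example.ParseFromString(raw)
        # 'spec_n1'/'spec_n2' hold the frame counts of the two spectrograms in a pair
        spec_n1 = example.features.feature['spec_n1'].int64_list.value[0]
        spec_n2 = example.features.feature['spec_n2'].int64_list.value[0]
        spec_n = max(spec_n, spec_n1, spec_n2)
        count += 1
    # return a fresh iterator, since the counting loop exhausted the first one
    return tf.python_io.tf_record_iterator(path=records_file), count, spec_n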
@@ -253,8 +259,9 @@ if __name__ == '__main__':
     # create_spectrogram_tfrecords('audio',sample_count=100)
     # create_spectrogram_tfrecords('story_all',sample_count=25)
     # fix_csv('story_words_test')
-    #fix_csv('story_phrases')
-    create_spectrogram_tfrecords('story_phrases',sample_count=100,train_test_ratio=0.1)
+    #fix_csv('audio')
+    # create_spectrogram_tfrecords('story_words_test',sample_count=100,train_test_ratio=0.1)
+    record_generator_count()
     # create_spectrogram_tfrecords('audio',sample_count=50)
     # read_siamese_tfrecords_generator('audio')
     # padd_zeros_siamese_tfrecords('audio')
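Note: as committed, the driver calls record_generator_count() with no argument, but the function requires records_file, so running this script raises a TypeError. Presumably a concrete records path was intended, along the lines of (path hypothetical):

record_generator_count('./outputs/story_words_test.train.tfrecords')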

View File

@@ -9,6 +9,7 @@ from keras.utils import to_categorical
 from keras.optimizers import RMSprop
 from keras.callbacks import TensorBoard, ModelCheckpoint
 from keras import backend as K
+from keras.utils import plot_model

 from speech_tools import create_dir,step_count
@@ -17,10 +18,10 @@ def create_base_rnn_network(input_dim):
     '''
     inp = Input(shape=input_dim)
     # ls0 = LSTM(512, return_sequences=True)(inp)
-    ls1 = Bidirectional(LSTM(128, return_sequences=True))(inp)
-    ls2 = LSTM(128, return_sequences=True)(ls1)
+    ls1 = LSTM(128, return_sequences=True)(inp)
+    ls2 = LSTM(64, return_sequences=True)(ls1)
     # ls3 = LSTM(32, return_sequences=True)(ls2)
-    ls4 = LSTM(64)(ls2)
+    ls4 = LSTM(32)(ls2)
     # d1 = Dense(128, activation='relu')(ls4)
     #d2 = Dense(64, activation='relu')(ls2)
     return Model(inp, ls4)
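Note: the encoder is cut roughly in half at every level: the bidirectional first layer becomes a plain LSTM and each layer's width halves. An LSTM layer with input dimension d and u units carries 4*(d*u + u*u + u) weights (four gates, each with input weights, recurrent weights, and a bias), so narrowing u shrinks the dominant u*u term quadratically. A worked comparison for the middle layer, whose input is 256-wide under the old bidirectional first layer (Bidirectional concatenates both directions) and 128-wide under the new plain one:

def lstm_params(d, u):
    # 4 gates x (input weights d*u + recurrent weights u*u + biases u)
    return 4 * (d * u + u * u + u)

print(lstm_params(256, 128))  # old ls2: 197,120 weights
print(lstm_params(128, 64))   # new ls2:  49,408 weights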
@@ -55,7 +56,7 @@ def siamese_model(input_dim):
     processed_b = base_network(input_b)
     final_output = dense_classifier([processed_a,processed_b])
     model = Model([input_a, input_b], final_output)
-    return model
+    return model,base_network

 def write_model_arch(mod,mod_file):
     model_f = open(mod_file,'w')
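Note: returning base_network alongside the assembled model is a small but useful API change. Both inputs are encoded by the same Model instance, so the branch weights are shared; exposing that instance lets the caller plot or checkpoint the encoder on its own. A minimal sketch of the weight-sharing pattern (layer sizes illustrative):

from keras.layers import Input, LSTM
from keras.models import Model

inp = Input(shape=(None, 39))        # illustrative (time, features) shape
encoder = Model(inp, LSTM(32)(inp))  # stands in for create_base_rnn_network

input_a = Input(shape=(None, 39))
input_b = Input(shape=(None, 39))
# Calling the same Model on both inputs reuses a single set of weights.
encoded_a = encoder(input_a)
encoded_b = encoder(input_b)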
@@ -75,12 +76,13 @@ def train_siamese(audio_group = 'audio'):
     log_dir = './logs/'+audio_group
     create_dir(log_dir)
     tr_gen_fn,te_pairs,te_y,copy_read_consts = read_siamese_tfrecords_generator(audio_group,batch_size=batch_size,test_size=batch_size)
-    n_step,n_features,n_records = copy_read_consts()
+    n_step,n_features,n_records = copy_read_consts(model_dir)
     tr_gen = tr_gen_fn()
     input_dim = (n_step, n_features)
-    model = siamese_model(input_dim)
+    model,base_model = siamese_model(input_dim)
+    plot_model(model,show_shapes=True, to_file=model_dir+'/model.png')
+    plot_model(base_model,show_shapes=True, to_file=model_dir+'/base_model.png')
     tb_cb = TensorBoard(
         log_dir=log_dir,
         histogram_freq=1,
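Note: plot_model renders the graph through pydot and the system Graphviz binaries, which is why this commit also pins pydot and graphviz in the requirements; with show_shapes=True the PNGs annotate every layer with its input and output shapes. Keras raises ImportError at render time when the toolchain is missing, so a guarded call keeps training alive on machines without Graphviz (a sketch):

try:
    plot_model(model, show_shapes=True, to_file=model_dir + '/model.png')
    plot_model(base_model, show_shapes=True, to_file=model_dir + '/base_model.png')
except ImportError as e:
    # keras.utils.plot_model raises ImportError when pydot/graphviz are unavailable
    print('skipping model plots:', e)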

View File

@@ -1,5 +1,9 @@
 import parselmouth as pm
 from pysndfile import sndio as snd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+sns.set() # Use seaborn's default style to make graphs more pretty

 def pitch_array(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
     samples, samplerate, _ = snd.read(sample_file)
@@ -23,24 +27,110 @@ def compute_mfcc(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.a
     # sample_mfcc.to_array().shape
     return sample_mfcc.to_array()

-# sunflowers_vic_180_norm = pitch_array('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
-# sunflowers_fred_180_norm = pitch_array('outputs/audio/sunflowers-Fred-180-normal-6515.aiff')
-# sunflowers_vic_180_norm_mfcc = compute_mfcc('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
-fred_180_norm_mfcc = compute_mfcc('outputs/audio/sunflowers-Fred-180-normal-6515.aiff')
-alex_mfcc = compute_mfcc('outputs/audio/sunflowers-Alex-180-normal-4763.aiff')
-# # sunflowers_vic_180_norm.shape
-# # sunflowers_fred_180_norm.shape
-# alex_mfcc.shape
-# sunflowers_vic_180_norm_mfcc.shape
-# sunflowers_fred_180_norm_mfcc.shape
-from speech_spectrum import generate_aiff_spectrogram
-vic_spec = generate_aiff_spectrogram('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
-alex_spec = generate_aiff_spectrogram('outputs/audio/sunflowers-Alex-180-normal-4763.aiff')
-alex150spec = generate_aiff_spectrogram('outputs/audio/sunflowers-Alex-150-normal-589.aiff')
-vic_spec.shape
-alex_spec.shape
-alex150spec.shape
-alex_mfcc.shape
-fred_180_norm_mfcc.shape
+def compute_formants(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
+    sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'
+    samples, samplerate, _ = snd.read(sample_file)
+    sample_sound = pm.Sound(values=samples,sampling_frequency=samplerate)
+    sample_formant = sample_sound.to_formant_burg()
+    sample_formant.x_bins()
+    # sample_mfcc.to_array().shape
+    return sample_mfcc.to_array()
+
+def draw_spectrogram(spectrogram, dynamic_range=70):
+    X, Y = spectrogram.x_grid(), spectrogram.y_grid()
+    sg_db = 10 * np.log10(spectrogram.values.T)
+    plt.pcolormesh(X, Y, sg_db, vmin=sg_db.max() - dynamic_range, cmap='afmhot')
+    plt.ylim([spectrogram.ymin, spectrogram.ymax])
+    plt.xlabel("time [s]")
+    plt.ylabel("frequency [Hz]")
+
+def draw_intensity(intensity):
+    plt.plot(intensity.xs(), intensity.values, linewidth=3, color='w')
+    plt.plot(intensity.xs(), intensity.values, linewidth=1)
+    plt.grid(False)
+    plt.ylim(0)
+    plt.ylabel("intensity [dB]")
+
+def draw_pitch(pitch):
+    # Extract selected pitch contour, and
+    # replace unvoiced samples by NaN to not plot
+    pitch_values = pitch.to_matrix().values
+    pitch_values[pitch_values==0] = np.nan
+    plt.plot(pitch.xs(), pitch_values, linewidth=3, color='w')
+    plt.plot(pitch.xs(), pitch_values, linewidth=1)
+    plt.grid(False)
+    plt.ylim(0, pitch.ceiling)
+    plt.ylabel("pitch [Hz]")
+
+def pm_snd(sample_file):
+    # sample_file = 'inputs/self-apple/apple-low1.aiff'
+    samples, samplerate, _ = snd.read(sample_file)
+    return pm.Sound(values=samples,sampling_frequency=samplerate)
+
+def plot_sample_raw(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
+    # %matplotlib inline
+    # sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'
+    snd_d = pm_snd(sample_file)
+    plt.figure()
+    plt.plot(snd_d.xs(), snd_d.values)
+    plt.xlim([snd_d.xmin, snd_d.xmax])
+    plt.xlabel("time [s]")
+    plt.ylabel("amplitude")
+    plt.show()
+
+def plot_sample_intensity(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
+    snd_d = pm_snd(sample_file)
+    intensity = snd_d.to_intensity()
+    spectrogram = snd_d.to_spectrogram()
+    plt.figure()
+    draw_spectrogram(spectrogram)
+    plt.twinx()
+    draw_intensity(intensity)
+    plt.xlim([snd_d.xmin, snd_d.xmax])
+    plt.show()
+
+def plot_sample_pitch(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
+    snd_d = pm_snd(sample_file)
+    pitch = snd_d.to_pitch()
+    spectrogram = snd_d.to_spectrogram(window_length=0.03, maximum_frequency=8000)
+    plt.figure()
+    draw_spectrogram(spectrogram)
+    plt.twinx()
+    draw_pitch(pitch)
+    plt.xlim([snd_d.xmin, snd_d.xmax])
+    plt.show()
+
+# snd_part = snd_d.extract_part(from_time=0.9, preserve_times=True)
+# plt.figure()
+# plt.plot(snd_part.xs(), snd_part.values, linewidth=0.5)
+# plt.xlim([snd_part.xmin, snd_part.xmax])
+# plt.xlabel("time [s]")
+# plt.ylabel("amplitude")
+# plt.show()
+
+if __name__ == '__main__':
+    # sunflowers_vic_180_norm = pitch_array('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
+    # sunflowers_fred_180_norm = pitch_array('outputs/audio/sunflowers-Fred-180-normal-6515.aiff')
+    # sunflowers_vic_180_norm_mfcc = compute_mfcc('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
+    # fred_180_norm_mfcc = compute_mfcc('outputs/audio/sunflowers-Fred-180-normal-6515.aiff')
+    # alex_mfcc = compute_mfcc('outputs/audio/sunflowers-Alex-180-normal-4763.aiff')
+    # # # sunflowers_vic_180_norm.shape
+    # # # sunflowers_fred_180_norm.shape
+    # # alex_mfcc.shape
+    # # sunflowers_vic_180_norm_mfcc.shape
+    # # sunflowers_fred_180_norm_mfcc.shape
+    # from speech_spectrum import generate_aiff_spectrogram
+    # vic_spec = generate_aiff_spectrogram('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
+    # alex_spec = generate_aiff_spectrogram('outputs/audio/sunflowers-Alex-180-normal-4763.aiff')
+    # alex150spec = generate_aiff_spectrogram('outputs/audio/sunflowers-Alex-150-normal-589.aiff')
+    # vic_spec.shape
+    # alex_spec.shape
+    # alex150spec.shape
+    # alex_mfcc.shape
+    # fred_180_norm_mfcc.shape
+    plot_sample_pitch('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
+    plot_sample_pitch('inputs/self-apple/apple-low1.aiff')
+    plot_sample_pitch('inputs/self-apple/apple-low2.aiff')
+    plot_sample_pitch('inputs/self-apple/apple-medium1.aiff')
 # pm.SoundFileFormat
 # pm.Pitch.get_number_of_frames()
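Note: compute_formants looks half-finished as committed: it immediately overwrites its sample_file argument with the hard-coded default, discards the result of sample_formant.x_bins(), and returns sample_mfcc.to_array(), a leftover from compute_mfcc that is undefined in this scope and will raise NameError. A hedged correction, assuming the intent was to return formant tracks and that the Formant object exposes xs() and get_value_at_time() the way the Pitch and Intensity objects used above do:

def compute_formants(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
    samples, samplerate, _ = snd.read(sample_file)
    sample_sound = pm.Sound(values=samples, sampling_frequency=samplerate)
    sample_formant = sample_sound.to_formant_burg()
    # sample F1 and F2 at each analysis frame time
    times = sample_formant.xs()
    f1 = [sample_formant.get_value_at_time(1, t) for t in times]
    f2 = [sample_formant.get_value_at_time(2, t) for t in times]
    return np.array([times, f1, f2])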

View File

@@ -178,7 +178,7 @@ def visualize_results(audio_group='audio'):
 if __name__ == '__main__':
     # evaluate_siamese('./outputs/story_words_test.train.tfrecords',audio_group='story_words.gpu',weights ='siamese_speech_model-58-epoch-0.00-acc.h5')
     # evaluate_siamese('./outputs/story_words.test.tfrecords',audio_group='story_words',weights ='siamese_speech_model-675-epoch-0.00-acc.h5')
-    evaluate_siamese('./outputs/story_words_test.train.tfrecords',audio_group='story_phrases',weights ='siamese_speech_model-231-epoch-0.00-acc.h5')
+    evaluate_siamese('./outputs/story_words_test.train.tfrecords',audio_group='story_words.gpu',weights ='siamese_speech_model-58-epoch-0.00-acc.h5')
     # play_results('story_words')
     #inspect_tfrecord('./outputs/story_phrases.test.tfrecords',audio_group='story_phrases')
     # visualize_results('story_words.gpu')
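Note: this reverts the short-lived story_phrases evaluation: the story_words_test records are scored again with the 58-epoch story_words.gpu checkpoint, where audio_group selects the model directory the weights load from. If the phrase model is to be scored later, a consistent pairing would presumably look like the following (a hypothetical call assembled from the removed line and the commented inspect_tfrecord line above):

evaluate_siamese('./outputs/story_phrases.test.tfrecords',
                 audio_group='story_phrases',
                 weights='siamese_speech_model-231-epoch-0.00-acc.h5')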