Merge branch 'master' of /Users/malarkannan/Public/repos/speech-scoring

master
Malar Kannan 2017-11-28 12:32:50 +05:30
commit 8f79316893
5 changed files with 142 additions and 40 deletions

View File

@@ -8,6 +8,7 @@ distributed==1.19.3
entrypoints==0.2.3
enum34==1.1.6
futures==3.1.1
+graphviz==0.8.1
h5py==2.7.1
HeapDict==1.0.0
html5lib==0.9999999
@@ -41,12 +42,14 @@ partd==0.3.8
pexpect==4.2.1
pickleshare==0.7.4
pkg-resources==0.0.0
+praat-parselmouth==0.2.0
progressbar2==3.34.3
prompt-toolkit==1.0.15
-protobuf==3.4.0
+protobuf==3.5.0
psutil==5.4.0
ptyprocess==0.5.2
PyAudio==0.2.11
+pydot==1.2.3
Pygments==2.2.0
pyparsing==2.2.0
pysndfile==1.0.0
@@ -65,7 +68,7 @@ sortedcontainers==1.5.7
tables==3.4.2
tblib==1.3.2
tensorflow==1.3.0
-tensorflow-tensorboard==0.4.0rc1
+tensorflow-tensorboard==0.4.0rc3
terminado==0.6
testpath==0.3.1
toolz==0.8.2
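The three new pins line up with code changes later in this commit: praat-parselmouth backs the new pitch/formant analysis module, while pydot and graphviz are the rendering path for the new keras.utils.plot_model calls. A quick import check, assuming the packages install cleanly (plot_model additionally needs the system Graphviz 'dot' binary on PATH, which pip does not provide):

import parselmouth      # praat-parselmouth: Praat bindings used by the new analysis code
import pydot, graphviz  # rendering backends for keras.utils.plot_model further down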

View File

@@ -20,7 +20,7 @@ from tqdm import tqdm
def siamese_pairs(rightGroup, wrongGroup):
    group1 = [r for (i, r) in rightGroup.iterrows()]
    group2 = [r for (i, r) in wrongGroup.iterrows()]
-    rightWrongPairs = [(g1, g2) for g2 in group2 for g1 in group1]+[(g2, g1) for g2 in group2 for g1 in group1]
+    rightWrongPairs = [(g1, g2) for g2 in group2 for g1 in group1]#+[(g2, g1) for g2 in group2 for g1 in group1]
    rightRightPairs = [i for i in itertools.permutations(group1, 2)]#+[i for i in itertools.combinations(group2, 2)]
    def filter_criteria(s1,s2):
        same = s1['variant'] == s2['variant']
@@ -28,8 +28,8 @@ def siamese_pairs(rightGroup, wrongGroup):
        voice_diff = s1['voice'] != s2['voice']
        if not same and phon_same:
            return False
-        if same and not voice_diff:
-            return False
+        # if same and not voice_diff:
+        #     return False
        return True
    validRWPairs = [i for i in rightWrongPairs if filter_criteria(*i)]
    validRRPairs = [i for i in rightRightPairs if filter_criteria(*i)]
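For orientation, a minimal self-contained sketch of the pairing change above, with plain strings standing in for the DataFrame rows (toy data, not from the repo): right-wrong pairs are now built in one direction only, while right-right pairs remain ordered permutations.

import itertools

group1 = ['r1', 'r2']   # "right" pronunciation samples
group2 = ['w1']         # "wrong" pronunciation samples
right_wrong = [(g1, g2) for g2 in group2 for g1 in group1]  # reversed copies dropped
right_right = list(itertools.permutations(group1, 2))
print(right_wrong)  # [('r1', 'w1'), ('r2', 'w1')]
print(right_right)  # [('r1', 'r2'), ('r2', 'r1')]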
@@ -64,8 +64,8 @@ def create_spectrogram_tfrecords(audio_group='audio',sample_count=0,train_test_r
    for (w, word_group) in word_group_prog:
        word_group_prog.set_postfix(word=w,sample_name=sample_name)
        g = word_group.reset_index()
-        # g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram)
-        g['spectrogram'] = apply_by_multiprocessing(g['file_path'],compute_mfcc)
+        g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram)
+        # g['spectrogram'] = apply_by_multiprocessing(g['file_path'],compute_mfcc)
        sample_right = g.loc[g['variant'] == 'low']
        sample_wrong = g.loc[g['variant'] == 'medium']
        same, diff = siamese_pairs(sample_right, sample_wrong)
@@ -208,11 +208,17 @@ def audio_samples_word_count(audio_group='audio'):
def record_generator_count(records_file):
    record_iterator = tf.python_io.tf_record_iterator(path=records_file)
-    count = 0
+    count,spec_n = 0,0
    for i in record_iterator:
        example = tf.train.Example()
        example.ParseFromString(i)
+        spec_n1 = example.features.feature['spec_n1'].int64_list.value[0]
+        spec_n2 = example.features.feature['spec_n2'].int64_list.value[0]
+        spec_n = max([spec_n,spec_n1,spec_n2])
+        # import pdb; pdb.set_trace()  # breakpoint disabled: left active it would stop on every record
        count+=1
    record_iterator = tf.python_io.tf_record_iterator(path=records_file)
-    return record_iterator,count
+    return record_iterator,count,spec_n
def fix_csv(audio_group='audio'):
    audio_csv_lines = open('./outputs/' + audio_group + '.csv','r').readlines()
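record_generator_count now also reads the spec_n1/spec_n2 lengths stored in each example and returns their corpus-wide maximum, presumably so callers can pad variable-length spectrograms to a common size before batching. A hypothetical consumer (pad_to is illustrative, not a function in this repo):

import numpy as np

def pad_to(spec, spec_n):
    # zero-pad time frames (axis 0) up to the maximum length found in the records
    return np.pad(spec, ((0, spec_n - spec.shape[0]), (0, 0)), mode='constant')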
@@ -253,8 +259,9 @@ if __name__ == '__main__':
    # create_spectrogram_tfrecords('audio',sample_count=100)
    # create_spectrogram_tfrecords('story_all',sample_count=25)
    # fix_csv('story_words_test')
-    #fix_csv('story_phrases')
-    create_spectrogram_tfrecords('story_phrases',sample_count=100,train_test_ratio=0.1)
+    #fix_csv('audio')
+    # create_spectrogram_tfrecords('story_words_test',sample_count=100,train_test_ratio=0.1)
+    record_generator_count('./outputs/story_phrases.train.tfrecords')  # records_file is required; path assumed from the story_phrases records built above
    # create_spectrogram_tfrecords('audio',sample_count=50)
    # read_siamese_tfrecords_generator('audio')
    # padd_zeros_siamese_tfrecords('audio')

View File

@@ -9,6 +9,7 @@ from keras.utils import to_categorical
from keras.optimizers import RMSprop
from keras.callbacks import TensorBoard, ModelCheckpoint
from keras import backend as K
+from keras.utils import plot_model
from speech_tools import create_dir,step_count
@@ -17,10 +18,10 @@ def create_base_rnn_network(input_dim):
    '''
    inp = Input(shape=input_dim)
    # ls0 = LSTM(512, return_sequences=True)(inp)
-    ls1 = Bidirectional(LSTM(128, return_sequences=True))(inp)
-    ls2 = LSTM(128, return_sequences=True)(ls1)
+    ls1 = LSTM(128, return_sequences=True)(inp)
+    ls2 = LSTM(64, return_sequences=True)(ls1)
    # ls3 = LSTM(32, return_sequences=True)(ls2)
-    ls4 = LSTM(64)(ls2)
+    ls4 = LSTM(32)(ls2)
    # d1 = Dense(128, activation='relu')(ls4)
    #d2 = Dense(64, activation='relu')(ls2)
    return Model(inp, ls4)
@@ -55,7 +56,7 @@ def siamese_model(input_dim):
    processed_b = base_network(input_b)
    final_output = dense_classifier([processed_a,processed_b])
    model = Model([input_a, input_b], final_output)
-    return model
+    return model,base_network
def write_model_arch(mod,mod_file):
    model_f = open(mod_file,'w')
@@ -75,12 +76,13 @@ def train_siamese(audio_group = 'audio'):
    log_dir = './logs/'+audio_group
    create_dir(log_dir)
    tr_gen_fn,te_pairs,te_y,copy_read_consts = read_siamese_tfrecords_generator(audio_group,batch_size=batch_size,test_size=batch_size)
-    n_step,n_features,n_records = copy_read_consts()
+    n_step,n_features,n_records = copy_read_consts(model_dir)
    tr_gen = tr_gen_fn()
    input_dim = (n_step, n_features)
-    model = siamese_model(input_dim)
+    model,base_model = siamese_model(input_dim)
+    plot_model(model,show_shapes=True, to_file=model_dir+'/model.png')
+    plot_model(base_model,show_shapes=True, to_file=model_dir+'/base_model.png')
    tb_cb = TensorBoard(
        log_dir=log_dir,
        histogram_freq=1,
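The two plot_model calls above are what the new pydot and graphviz pins in requirements.txt exist for: plot_model writes a PNG of the layer graph via pydot plus the system Graphviz 'dot' binary. A standalone sketch with placeholder shapes (not the repo's actual dimensions):

from keras.models import Model
from keras.layers import Input, LSTM
from keras.utils import plot_model

inp = Input(shape=(100, 13))                        # stand-in for (n_step, n_features)
out = LSTM(32)(LSTM(64, return_sequences=True)(inp))
plot_model(Model(inp, out), show_shapes=True, to_file='model.png')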

View File

@@ -1,5 +1,9 @@
import parselmouth as pm
from pysndfile import sndio as snd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+sns.set()  # Use seaborn's default style to make the graphs prettier
def pitch_array(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
    samples, samplerate, _ = snd.read(sample_file)
@@ -23,24 +27,110 @@ def compute_mfcc(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.a
    # sample_mfcc.to_array().shape
    return sample_mfcc.to_array()
-# sunflowers_vic_180_norm = pitch_array('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
-# sunflowers_fred_180_norm = pitch_array('outputs/audio/sunflowers-Fred-180-normal-6515.aiff')
-# sunflowers_vic_180_norm_mfcc = compute_mfcc('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
-fred_180_norm_mfcc = compute_mfcc('outputs/audio/sunflowers-Fred-180-normal-6515.aiff')
-alex_mfcc = compute_mfcc('outputs/audio/sunflowers-Alex-180-normal-4763.aiff')
-# # sunflowers_vic_180_norm.shape
-# # sunflowers_fred_180_norm.shape
-# alex_mfcc.shape
-# sunflowers_vic_180_norm_mfcc.shape
-# sunflowers_fred_180_norm_mfcc.shape
-from speech_spectrum import generate_aiff_spectrogram
-vic_spec = generate_aiff_spectrogram('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
-alex_spec = generate_aiff_spectrogram('outputs/audio/sunflowers-Alex-180-normal-4763.aiff')
-alex150spec = generate_aiff_spectrogram('outputs/audio/sunflowers-Alex-150-normal-589.aiff')
-vic_spec.shape
-alex_spec.shape
-alex150spec.shape
-alex_mfcc.shape
-fred_180_norm_mfcc.shape
-# pm.SoundFileFormat
-# pm.Pitch.get_number_of_frames()
+def compute_formants(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
+    samples, samplerate, _ = snd.read(sample_file)
+    sample_sound = pm.Sound(values=samples,sampling_frequency=samplerate)
+    sample_formant = sample_sound.to_formant_burg()
+    # The draft returned the undefined sample_mfcc; sampling F1/F2 at each
+    # analysis frame into an array (matching compute_mfcc's style) is assumed here.
+    frames = sample_formant.xs()
+    return np.array([[sample_formant.get_value_at_time(f, t) for t in frames] for f in (1, 2)])
+def draw_spectrogram(spectrogram, dynamic_range=70):
+    X, Y = spectrogram.x_grid(), spectrogram.y_grid()
+    sg_db = 10 * np.log10(spectrogram.values.T)
+    plt.pcolormesh(X, Y, sg_db, vmin=sg_db.max() - dynamic_range, cmap='afmhot')
+    plt.ylim([spectrogram.ymin, spectrogram.ymax])
+    plt.xlabel("time [s]")
+    plt.ylabel("frequency [Hz]")
+def draw_intensity(intensity):
+    # intensity.values is (1, n); transpose so its length matches intensity.xs()
+    plt.plot(intensity.xs(), intensity.values.T, linewidth=3, color='w')
+    plt.plot(intensity.xs(), intensity.values.T, linewidth=1)
+    plt.grid(False)
+    plt.ylim(0)
+    plt.ylabel("intensity [dB]")
+def draw_pitch(pitch):
+    # Extract the pitch contour, transposed so it lines up with pitch.xs(),
+    # and replace unvoiced samples by NaN so they are not plotted
+    pitch_values = pitch.to_matrix().values.T
+    pitch_values[pitch_values==0] = np.nan
+    plt.plot(pitch.xs(), pitch_values, linewidth=3, color='w')
+    plt.plot(pitch.xs(), pitch_values, linewidth=1)
+    plt.grid(False)
+    plt.ylim(0, pitch.ceiling)
+    plt.ylabel("pitch [Hz]")
+def pm_snd(sample_file):
+    # sample_file = 'inputs/self-apple/apple-low1.aiff'
+    samples, samplerate, _ = snd.read(sample_file)
+    return pm.Sound(values=samples,sampling_frequency=samplerate)
+def plot_sample_raw(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
+    # %matplotlib inline
+    # sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'
+    snd_d = pm_snd(sample_file)
+    plt.figure()
+    # snd_d.values is (1, n); transpose so its length matches snd_d.xs()
+    plt.plot(snd_d.xs(), snd_d.values.T)
+    plt.xlim([snd_d.xmin, snd_d.xmax])
+    plt.xlabel("time [s]")
+    plt.ylabel("amplitude")
+    plt.show()
+def plot_sample_intensity(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
+    snd_d = pm_snd(sample_file)
+    intensity = snd_d.to_intensity()
+    spectrogram = snd_d.to_spectrogram()
+    plt.figure()
+    draw_spectrogram(spectrogram)
+    plt.twinx()
+    draw_intensity(intensity)
+    plt.xlim([snd_d.xmin, snd_d.xmax])
+    plt.show()
+def plot_sample_pitch(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
+    snd_d = pm_snd(sample_file)
+    pitch = snd_d.to_pitch()
+    spectrogram = snd_d.to_spectrogram(window_length=0.03, maximum_frequency=8000)
+    plt.figure()
+    draw_spectrogram(spectrogram)
+    plt.twinx()
+    draw_pitch(pitch)
+    plt.xlim([snd_d.xmin, snd_d.xmax])
+    plt.show()
+# snd_part = snd_d.extract_part(from_time=0.9, preserve_times=True)
+# plt.figure()
+# plt.plot(snd_part.xs(), snd_part.values, linewidth=0.5)
+# plt.xlim([snd_part.xmin, snd_part.xmax])
+# plt.xlabel("time [s]")
+# plt.ylabel("amplitude")
+# plt.show()
+if __name__ == '__main__':
+    # sunflowers_vic_180_norm = pitch_array('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
+    # sunflowers_fred_180_norm = pitch_array('outputs/audio/sunflowers-Fred-180-normal-6515.aiff')
+    # sunflowers_vic_180_norm_mfcc = compute_mfcc('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
+    # fred_180_norm_mfcc = compute_mfcc('outputs/audio/sunflowers-Fred-180-normal-6515.aiff')
+    # alex_mfcc = compute_mfcc('outputs/audio/sunflowers-Alex-180-normal-4763.aiff')
+    # # # sunflowers_vic_180_norm.shape
+    # # # sunflowers_fred_180_norm.shape
+    # # alex_mfcc.shape
+    # # sunflowers_vic_180_norm_mfcc.shape
+    # # sunflowers_fred_180_norm_mfcc.shape
+    # from speech_spectrum import generate_aiff_spectrogram
+    # vic_spec = generate_aiff_spectrogram('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
+    # alex_spec = generate_aiff_spectrogram('outputs/audio/sunflowers-Alex-180-normal-4763.aiff')
+    # alex150spec = generate_aiff_spectrogram('outputs/audio/sunflowers-Alex-150-normal-589.aiff')
+    # vic_spec.shape
+    # alex_spec.shape
+    # alex150spec.shape
+    # alex_mfcc.shape
+    # fred_180_norm_mfcc.shape
+    plot_sample_pitch('outputs/audio/sunflowers-Victoria-180-normal-870.aiff')
+    plot_sample_pitch('inputs/self-apple/apple-low1.aiff')
+    plot_sample_pitch('inputs/self-apple/apple-low2.aiff')
+    plot_sample_pitch('inputs/self-apple/apple-medium1.aiff')
+    # pm.SoundFileFormat
+    # pm.Pitch.get_number_of_frames()
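One caveat with the new plotting helpers (an observation, not something this commit changes): each plot_sample_* call ends in plt.show(), which blocks and requires a display. On a headless training box the usual workaround is the Agg backend plus savefig, selected before pyplot is first imported:

import matplotlib
matplotlib.use('Agg')   # assumed headless environment; must run before any pyplot import
import matplotlib.pyplot as plt
plt.plot([0, 1], [0, 1])
plt.savefig('figure.png')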

View File

@@ -178,7 +178,7 @@ def visualize_results(audio_group='audio'):
if __name__ == '__main__':
    # evaluate_siamese('./outputs/story_words_test.train.tfrecords',audio_group='story_words.gpu',weights ='siamese_speech_model-58-epoch-0.00-acc.h5')
    # evaluate_siamese('./outputs/story_words.test.tfrecords',audio_group='story_words',weights ='siamese_speech_model-675-epoch-0.00-acc.h5')
-    evaluate_siamese('./outputs/story_words_test.train.tfrecords',audio_group='story_phrases',weights ='siamese_speech_model-231-epoch-0.00-acc.h5')
+    evaluate_siamese('./outputs/story_words_test.train.tfrecords',audio_group='story_words.gpu',weights ='siamese_speech_model-58-epoch-0.00-acc.h5')
    # play_results('story_words')
    #inspect_tfrecord('./outputs/story_phrases.test.tfrecords',audio_group='story_phrases')
    # visualize_results('story_words.gpu')