Compare commits

...

14 Commits

Author SHA1 Message Date
Malar Kannan e4b8b4e0a7 visualizing and playing sound files where prediction fails 2017-11-13 19:22:30 +05:30
Malar Kannan 988f66c2c2 avoiding same voice similar variants 2017-11-13 17:33:37 +05:30
Malar Kannan d978272bdb saving model and tensorboard; checkpointing model 2017-11-10 18:09:14 +05:30
Malar Kannan bb72c4045e trying to overfit the model to identify false-negative types 2017-11-10 17:52:21 +05:30
Malar Kannan 1190312def removed tfrecord tensor code and remnants 2017-11-10 14:15:12 +05:30
Malar Kannan e9b18921ee implemented train/test split at word-level and generator returns one-shot validation data 2017-11-10 14:07:31 +05:30
Malar Kannan ab452494b3 implemented streaming tfrecords 2017-11-09 20:31:29 +05:30
Malar Kannan 0a4d4fadeb implemented random sampling of data for oneshot loading 2017-11-09 15:00:17 +05:30
Malar Kannan b3a6aa2f6a clean-up 2017-11-08 11:08:19 +05:30
Malar Kannan 7cbfebbf1a 1. fixed missing wrong pairs; 2. using different progress backend 2017-11-07 17:27:09 +05:30
Malar Kannan b8a9f87031 implemented padding and pipeline is complete 2017-11-07 15:18:04 +05:30
Malar Kannan 41b3f1a9fe dropping invalid csv entries 2017-11-07 12:43:17 +05:30
Malar Kannan 55e2de2f04 using csv writer instead as comma in phrases are mis-aligning columns 2017-11-07 11:56:09 +05:30
Malar Kannan 15f29895d4 implemented tfrecord reader and model refactor wip 2017-11-07 00:10:23 +05:30
9 changed files with 540 additions and 275 deletions

pandas_parallel.py Deleted file

@@ -1,25 +0,0 @@
import multiprocessing
import pandas as pd
import numpy as np

def _apply_df(args):
    df, func, num, kwargs = args
    return num, df.apply(func, **kwargs)

def apply_by_multiprocessing(df,func,**kwargs):
    cores = multiprocessing.cpu_count()
    workers=kwargs.pop('workers') if 'workers' in kwargs else cores
    pool = multiprocessing.Pool(processes=workers)
    result = pool.map(_apply_df, [(d, func, i, kwargs) for i,d in enumerate(np.array_split(df, workers))])
    pool.close()
    result=sorted(result,key=lambda x:x[0])
    return pd.concat([i[1] for i in result])

def square(x):
    return x**x

if __name__ == '__main__':
    df = pd.DataFrame({'a':range(10), 'b':range(10)})
    apply_by_multiprocessing(df, square, axis=1, workers=4)
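This helper was not dropped outright: an identical copy now lives in the new speech_utils.py (further down), and callers switch their import accordingly. A minimal usage sketch, assuming speech_utils.py is importable; the row_product function and the column names are illustrative. Note the applied function must be module-level so it pickles across processes:

import pandas as pd
from speech_utils import apply_by_multiprocessing

def row_product(row):  # must be module-level so multiprocessing can pickle it
    return row['a'] * row['b']

if __name__ == '__main__':
    df = pd.DataFrame({'a': range(10), 'b': range(10)})
    # applies row_product row-wise across 4 worker processes and
    # concatenates the partial results back in their original order
    print(apply_by_multiprocessing(df, row_product, axis=1, workers=4))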

76 requirements-linux.txt Normal file

@@ -0,0 +1,76 @@
bleach==1.5.0
click==6.7
cloudpickle==0.4.1
cycler==0.10.0
dask==0.15.4
decorator==4.1.2
distributed==1.19.3
entrypoints==0.2.3
enum34==1.1.6
futures==3.1.1
h5py==2.7.1
HeapDict==1.0.0
html5lib==0.9999999
ipykernel==4.6.1
ipython==6.2.1
ipython-genutils==0.2.0
ipywidgets==7.0.3
jedi==0.11.0
Jinja2==2.9.6
jsonschema==2.6.0
jupyter==1.0.0
jupyter-client==5.1.0
jupyter-console==5.2.0
jupyter-core==4.3.0
Keras==2.0.8
locket==0.2.0
Markdown==2.6.9
MarkupSafe==1.0
matplotlib==2.1.0
mistune==0.7.4
msgpack-python==0.4.8
nbconvert==5.3.1
nbformat==4.4.0
notebook==5.2.0
numexpr==2.6.4
numpy==1.13.3
pandas==0.20.3
pandocfilters==1.4.2
parso==0.1.0
partd==0.3.8
pexpect==4.2.1
pickleshare==0.7.4
progressbar2==3.34.3
prompt-toolkit==1.0.15
protobuf==3.4.0
psutil==5.4.0
ptyprocess==0.5.2
PyAudio==0.2.11
Pygments==2.2.0
pyparsing==2.2.0
pysndfile==1.0.0
python-dateutil==2.6.1
python-utils==2.2.0
pytz==2017.2
PyYAML==3.12
pyzmq==16.0.2
qtconsole==4.3.1
scikit-learn==0.19.0
scipy==0.19.1
simplegeneric==0.8.1
six==1.11.0
sortedcontainers==1.5.7
tables==3.4.2
tblib==1.3.2
tensorflow==1.3.0
tensorflow-tensorboard==0.4.0rc1
terminado==0.6
testpath==0.3.1
toolz==0.8.2
tornado==4.5.2
tqdm==4.19.4
traitlets==4.3.2
wcwidth==0.1.7
Werkzeug==0.12.2
widgetsnbextension==3.0.6
zict==0.1.3

speech_data.py

@@ -1,220 +1,246 @@
 import pandas as pd
-from pandas_parallel import apply_by_multiprocessing
+from speech_utils import apply_by_multiprocessing
+from speech_utils import threadsafe_iter
 # import dask as dd
 # import dask.dataframe as ddf
 import tensorflow as tf
-from tensorflow.python.ops import data_flow_ops
 import numpy as np
-from spectro_gen import generate_aiff_spectrogram
+from speech_spectrum import generate_aiff_spectrogram
 from sklearn.model_selection import train_test_split
 import itertools
 import os
 import random
 import csv
 import gc
+import pickle
+from tqdm import tqdm

-def get_siamese_pairs(groupF1, groupF2):
-    group1 = [r for (i, r) in groupF1.iterrows()]
-    group2 = [r for (i, r) in groupF2.iterrows()]
-    diff = [(g1, g2) for g2 in group2 for g1 in group1]
-    same = [i for i in itertools.combinations(group1, 2)
-            ] + [i for i in itertools.combinations(group2, 2)]
-    random.shuffle(same)
-    random.shuffle(diff)
-    # return (random.sample(same,10), random.sample(diff,10))
-    return same[:10],diff[:10]

 def siamese_pairs(rightGroup, wrongGroup):
     group1 = [r for (i, r) in rightGroup.iterrows()]
     group2 = [r for (i, r) in wrongGroup.iterrows()]
     rightWrongPairs = [(g1, g2) for g2 in group2 for g1 in group1]
-    rightRightPairs = [i for i in itertools.combinations(group1, 2)]
-    random.shuffle(rightWrongPairs)
-    random.shuffle(rightRightPairs)
-    # return (random.sample(same,10), random.sample(diff,10))
-    return rightRightPairs[:10],rightWrongPairs[:10]
+    rightRightPairs = [i for i in itertools.combinations(group1, 2)]#+[i for i in itertools.combinations(group2, 2)]
+    # random.shuffle(rightWrongPairs)
+    # random.shuffle(rightRightPairs)
+    # return rightRightPairs[:10],rightWrongPairs[:10]
+    return rightRightPairs[:32],rightWrongPairs[:32]

-def append_zeros(spgr, max_samples):
-    return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
-                      'median')
+def _float_feature(value):
+    return tf.train.Feature(float_list=tf.train.FloatList(value=value))
+
+def _int64_feature(value):
+    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
+
+def _bytes_feature(value):
+    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))
+def create_spectrogram_tfrecords(audio_group='audio',sample_count=0,train_test_ratio=0.1):
+    '''
+    http://warmspringwinds.github.io/tensorflow/tf-slim/2016/12/21/tfrecords-guide/
+    http://www.machinelearninguru.com/deep_learning/tensorflow/basics/tfrecord/tfrecord.html
+    '''
+    audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv',index_col=0)
+    audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x)
+    n_records,n_spec,n_features = 0,0,0
+    def write_samples(wg,sample_name):
+        word_group_prog = tqdm(wg,desc='Computing spectrogram')
+        record_file = './outputs/{}.{}.tfrecords'.format(audio_group,sample_name)
+        writer = tf.python_io.TFRecordWriter(record_file)
+        for (w, word_group) in word_group_prog:
+            word_group_prog.set_postfix(word=w,sample_name=sample_name)
+            g = word_group.reset_index()
+            g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram)
+            sample_right = g.loc[g['variant'] == 'low']
+            sample_wrong = g.loc[g['variant'] == 'medium']
+            same, diff = siamese_pairs(sample_right, sample_wrong)
+            groups = [([0,1],same),([1,0],diff)]
+            for (output,group) in groups:
+                group_prog = tqdm(group,desc='Writing Spectrogram')
+                for sample1,sample2 in group_prog:
+                    same = sample1['variant'] == sample2['variant']
+                    phon_same = sample1['phonemes'] == sample2['phonemes']
+                    voice_diff = sample1['voice'] != sample2['voice']
+                    if not same and phon_same:
+                        continue
+                    if same and not voice_diff:
+                        continue
+                    group_prog.set_postfix(output=output
+                                           ,var1=sample1['variant']
+                                           ,var2=sample2['variant'])
+                    spectro1,spectro2 = sample1['spectrogram'],sample2['spectrogram']
+                    spec_n1,spec_n2 = spectro1.shape[0],spectro2.shape[0]
+                    spec_w1,spec_w2 = spectro1.shape[1],spectro2.shape[1]
+                    spec1,spec2 = spectro1.reshape(-1),spectro2.reshape(-1)
+                    nonlocal n_spec,n_records,n_features
+                    n_spec = max([n_spec,spec_n1,spec_n2])
+                    n_features = spec_w1
+                    n_records+=1
+                    example = tf.train.Example(features=tf.train.Features(
+                        feature={
+                            'word': _bytes_feature([w.encode('utf-8')]),
+                            'phoneme1': _bytes_feature([sample1['phonemes'].encode('utf-8')]),
+                            'phoneme2': _bytes_feature([sample2['phonemes'].encode('utf-8')]),
+                            'voice1': _bytes_feature([sample1['voice'].encode('utf-8')]),
+                            'voice2': _bytes_feature([sample2['voice'].encode('utf-8')]),
+                            'language': _bytes_feature([sample1['language'].encode('utf-8')]),
+                            'rate1':_int64_feature([sample1['rate']]),
+                            'rate2':_int64_feature([sample2['rate']]),
+                            'variant1': _bytes_feature([sample1['variant'].encode('utf-8')]),
+                            'variant2': _bytes_feature([sample2['variant'].encode('utf-8')]),
+                            'file1': _bytes_feature([sample1['file'].encode('utf-8')]),
+                            'file2': _bytes_feature([sample2['file'].encode('utf-8')]),
+                            'spec1':_float_feature(spec1),
+                            'spec2':_float_feature(spec2),
+                            'spec_n1':_int64_feature([spec_n1]),
+                            'spec_w1':_int64_feature([spec_w1]),
+                            'spec_n2':_int64_feature([spec_n2]),
+                            'spec_w2':_int64_feature([spec_w2]),
+                            'output':_int64_feature(output)
+                        }
+                    ))
+                    writer.write(example.SerializeToString())
+                group_prog.close()
+        word_group_prog.close()
+        writer.close()
+    word_groups = [i for i in audio_samples.groupby('word')]
+    wg_sampled = reservoir_sample(word_groups,sample_count) if sample_count > 0 else word_groups
+    tr_audio_samples,te_audio_samples = train_test_split(wg_sampled,test_size=train_test_ratio)
+    write_samples(tr_audio_samples,'train')
+    write_samples(te_audio_samples,'test')
+    const_file = os.path.join('./outputs',audio_group+'.constants')
+    pickle.dump((n_spec,n_features,n_records),open(const_file,'wb'))
 def padd_zeros(spgr, max_samples):
     return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
                       'constant')

-def to_onehot(a,class_count=2):
-    # >>> a = np.array([1, 0, 3])
-    a_row_n = a.shape[0]
-    b = np.zeros((a_row_n, class_count))
-    b[np.arange(a_row_n), a] = 1
-    return b
-
-def create_pair(l, r, max_samples):
-    l_sample = padd_zeros(l, max_samples)
-    r_sample = padd_zeros(r, max_samples)
-    return np.asarray([l_sample, r_sample])
+def reservoir_sample(iterable, k):
+    it = iter(iterable)
+    if not (k > 0):
+        raise ValueError("sample size must be positive")
+
+    sample = list(itertools.islice(it, k))  # fill the reservoir
+    random.shuffle(sample)  # if the number of items is less than *k*,
+                            # return all items in random order
+    for i, item in enumerate(it, start=k+1):
+        j = random.randrange(i)  # random [0..i)
+        if j < k:
+            sample[j] = item  # replace item with gradually decreasing probability
+    return sample
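The added routine above is classic reservoir sampling (Algorithm R, plus an initial shuffle): after the reservoir is filled, the i-th item replaces a random slot with probability k/i, which leaves every item of the stream with an equal chance of ending up in the sample. A quick empirical check of that uniformity claim, assuming reservoir_sample is defined as above; the loop sizes are illustrative:

from collections import Counter

counts = Counter()
for _ in range(10000):
    counts.update(reservoir_sample(range(10), 3))
# sampling 3 of 10 values 10000 times, each value should appear ~3000 times
print(sorted(counts.items()))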
-def create_test_pair(l, r, max_samples):
-    l_sample = append_zeros(l, max_samples)
-    r_sample = append_zeros(r, max_samples)
-    return np.asarray([[l_sample, r_sample]])
+def read_siamese_tfrecords_generator(audio_group='audio',batch_size=32,test_size=0):
+    records_file = os.path.join('./outputs',audio_group+'.train.tfrecords')
+    input_pairs = []
+    output_class = []
+    const_file = os.path.join('./outputs',audio_group+'.constants')
+    (n_spec,n_features,n_records) = pickle.load(open(const_file,'rb'))
+    print('reading tfrecords({}-train)...'.format(audio_group))
+    # @threadsafe_iter
+    def record_generator():
+        input_data = []
+        output_data = []
+        while True:
+            record_iterator = tf.python_io.tf_record_iterator(path=records_file)
+            #tqdm(enumerate(record_iterator),total=n_records)
+            for (i,string_record) in enumerate(record_iterator):
+                example = tf.train.Example()
+                example.ParseFromString(string_record)
+                spec_n1 = example.features.feature['spec_n1'].int64_list.value[0]
+                spec_n2 = example.features.feature['spec_n2'].int64_list.value[0]
+                spec_w1 = example.features.feature['spec_w1'].int64_list.value[0]
+                spec_w2 = example.features.feature['spec_w2'].int64_list.value[0]
+                spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1)
+                spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2)
+                p_spec1,p_spec2 = padd_zeros(spec1,n_spec),padd_zeros(spec2,n_spec)
+                input_data.append(np.asarray([p_spec1,p_spec2]))
+                output = example.features.feature['output'].int64_list.value
+                output_data.append(np.asarray(output))
+                if len(input_data) == batch_size:
+                    input_arr = np.asarray(input_data)
+                    output_arr = np.asarray(output_data)
+                    yield ([input_arr[:, 0], input_arr[:, 1]],output_arr)
+                    input_data = []
+                    output_data = []

-def create_X(sp, max_samples):
-    return create_pair(sp[0]['spectrogram'], sp[1]['spectrogram'], max_samples)
+    # Read test in one-shot
+    te_records_file = os.path.join('./outputs',audio_group+'.test.tfrecords')
+    te_re_iterator = tf.python_io.tf_record_iterator(path=records_file)
+    te_n_records = len([i for i in te_re_iterator])
+    te_re_iterator = tf.python_io.tf_record_iterator(path=records_file)
+    print('reading tfrecords({}-test)...'.format(audio_group))
+    test_size = min([test_size,te_n_records]) if test_size > 0 else te_n_records
+    input_data = np.zeros((test_size,2,n_spec,n_features))
+    output_data = np.zeros((test_size,2))
+    random_samples = enumerate(reservoir_sample(te_re_iterator,test_size))
+    for (i,string_record) in tqdm(random_samples,total=test_size):
+        example = tf.train.Example()
+        example.ParseFromString(string_record)
+        spec_n1 = example.features.feature['spec_n1'].int64_list.value[0]
+        spec_n2 = example.features.feature['spec_n2'].int64_list.value[0]
+        spec_w1 = example.features.feature['spec_w1'].int64_list.value[0]
+        spec_w2 = example.features.feature['spec_w2'].int64_list.value[0]
+        spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1)
+        spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2)
+        p_spec1,p_spec2 = padd_zeros(spec1,n_spec),padd_zeros(spec2,n_spec)
+        input_data[i] = np.asarray([p_spec1,p_spec2])
+        output = example.features.feature['output'].int64_list.value
+        output_data[i] = np.asarray(output)
+    return record_generator,input_data,output_data,n_spec,n_features,n_records
-# def get_word_pairs_data(word, max_samples):
-#     audio_samples = pd.read_csv(
-#         './outputs/audio.csv',
-#         names=['word', 'voice', 'rate', 'variant', 'file'])
-#     audio_samples = audio_samples.loc[audio_samples['word'] ==
-#                                       word].reset_index(drop=True)
-#     audio_samples.loc[:, 'spectrogram'] = audio_samples.loc[:, 'file'].apply(
-#         lambda x: 'outputs/audio/' + x).apply(generate_aiff_spectrogram)
-#     max_samples = audio_samples['spectrogram'].apply(
-#         lambda x: x.shape[0]).max()
-#     same_data, diff_data = [], []
-#     for (w, g) in audio_samples.groupby(audio_samples['word']):
-#         sample_norm = g.loc[audio_samples['variant'] == 'normal']
-#         sample_phon = g.loc[audio_samples['variant'] == 'phoneme']
-#         same, diff = get_siamese_pairs(sample_norm, sample_phon)
-#         same_data.extend([create_X(s, max_samples) for s in same])
-#         diff_data.extend([create_X(d, max_samples) for d in diff])
-#     Y = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))])
-#     X = np.asarray(same_data + diff_data)
-#     # tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1)
-#     return (X, Y)
+def audio_samples_word_count(audio_group='audio'):
+    audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv')
+    return len(audio_samples.groupby(audio_samples['word']))
-def create_spectrogram_data(audio_group='audio'):
-    audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv'
-        , names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file']
-        , quoting=csv.QUOTE_NONE)
-    # audio_samples = audio_samples.loc[audio_samples['word'] ==
-    #                                   'sunflowers'].reset_index(drop=True)
-    audio_samples['file_paths'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x)
-    audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_paths'], os.path.exists)
-    audio_samples = audio_samples[audio_samples['file_exists'] == True].reset_index()
-    audio_samples['spectrogram'] = apply_by_multiprocessing(audio_samples['file_paths'],generate_aiff_spectrogram)#.apply(
-    audio_samples['window_count'] = audio_samples.loc[:,'spectrogram'].apply(lambda x: x.shape[0])
-    audio_samples.to_pickle('outputs/{}-spectrogram.pkl'.format(audio_group))
-
-def create_spectrogram_tfrecords(audio_group='audio'):
-    audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv'
-        , names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file']
-        , quoting=csv.QUOTE_NONE)
-    # audio_samples = audio_samples.loc[audio_samples['word'] ==
-    #                                   'sunflowers'].reset_index(drop=True)
-    audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x)
-    audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_path'], os.path.exists)
-    audio_samples = audio_samples[audio_samples['file_exists'] == True].reset_index()
-
-    def _float_feature(value):
-        return tf.train.Feature(float_list=tf.train.FloatList(value=value))
-    def _int64_feature(value):
-        return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
-    def _bytes_feature(value):
-        return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))
-
-    writer = tf.python_io.TFRecordWriter('./outputs/' + audio_group + '.tfrecords')
-    # audio_samples = audio_samples[:100]
-    for (w, word_group) in audio_samples.groupby(audio_samples['word']):
-        g = word_group.reset_index()
-        g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram)
-        sample_right = g.loc[audio_samples['variant'] == 'low']
-        sample_wrong = g.loc[audio_samples['variant'] == 'medium']
-        same, diff = siamese_pairs(sample_right, sample_wrong)
-        groups = [([0,1],same),([1,0],diff)]
-        for (output,group) in groups:
-            for sample1,sample2 in group:
-                spectro1,spectro2 = sample1['spectrogram'],sample2['spectrogram']
-                spec_n1,spec_n2 = spectro1.shape[0],spectro2.shape[0]
-                spec_w1,spec_w2 = spectro1.shape[1],spectro2.shape[1]
-                spec1,spec2 = spectro1.reshape(-1),spectro2.reshape(-1)
-                example = tf.train.Example(features=tf.train.Features(
-                    feature={
-                        'word': _bytes_feature([w.encode('utf-8')]),
-                        'phoneme1': _bytes_feature([sample1['phonemes'].encode('utf-8')]),
-                        'phoneme2': _bytes_feature([sample2['phonemes'].encode('utf-8')]),
-                        'voice1': _bytes_feature([sample1['voice'].encode('utf-8')]),
-                        'voice2': _bytes_feature([sample2['voice'].encode('utf-8')]),
-                        'language': _bytes_feature([sample1['language'].encode('utf-8')]),
-                        'rate1':_int64_feature([sample1['rate']]),
-                        'rate2':_int64_feature([sample2['rate']]),
-                        'variant1': _bytes_feature([sample1['variant'].encode('utf-8')]),
-                        'variant2': _bytes_feature([sample2['variant'].encode('utf-8')]),
-                        'file1': _bytes_feature([sample1['file'].encode('utf-8')]),
-                        'file2': _bytes_feature([sample2['file'].encode('utf-8')]),
-                        'spec1':_float_feature(spec1),
-                        'spec2':_float_feature(spec2),
-                        'spec_n1':_int64_feature([spec_n1]),
-                        'spec_w1':_int64_feature([spec_w1]),
-                        'spec_n2':_int64_feature([spec_n2]),
-                        'spec_w2':_int64_feature([spec_w2]),
-                        'output':_int64_feature(output)
-                    }
-                ))
-                writer.write(example.SerializeToString())
-    writer.close()
+def fix_csv(audio_group='audio'):
+    audio_csv_lines = open('./outputs/' + audio_group + '.csv.orig','r').readlines()
+    audio_csv_data = [i.strip().split(',') for i in audio_csv_lines]
+    proper_rows = [i for i in audio_csv_data if len(i) == 7]
+    with open('./outputs/' + audio_group + '.csv','w') as fixed_csv:
+        fixed_csv_w = csv.writer(fixed_csv, quoting=csv.QUOTE_MINIMAL)
+        fixed_csv_w.writerows(proper_rows)
+    audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv'
+        , names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file'])
+    audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x)
+    audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_path'], os.path.exists)
+    audio_samples = audio_samples[audio_samples['file_exists'] == True]
+    audio_samples = audio_samples.drop(['file_path','file_exists'],axis=1).reset_index(drop=True)
+    audio_samples.to_csv('./outputs/' + audio_group + '.csv')
-def create_tagged_data(audio_samples):
-    same_data, diff_data = [], []
-    for (w, g) in audio_samples.groupby(audio_samples['word']):
-        # sample_norm = g.loc[audio_samples['variant'] == 'low']
-        # sample_phon = g.loc[audio_samples['variant'] == 'medium']
-        sample_norm = g.loc[audio_samples['variant'] == 'normal']
-        sample_phon = g.loc[audio_samples['variant'] == 'phoneme']
-        same, diff = get_siamese_pairs(sample_norm, sample_phon)
-        same_data.extend([create_X(s) for s in same])
-        diff_data.extend([create_X(d) for d in diff])
-    print('creating all speech pairs')
-    Y_f = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))])
-    Y = to_onehot(Y_f.astype(np.int8))
-    print('casting as array speech pairs')
-    X = np.asarray(same_data + diff_data)
-    return X,Y
-
-def create_speech_pairs_data(audio_group='audio'):
-    audio_samples = pd.read_pickle('outputs/{}-spectrogram.pkl'.format(audio_group))
-    # sample_size = audio_samples['spectrogram'][0].shape[1]
-    tr_audio_samples,te_audio_samples = train_test_split(audio_samples, test_size=0.1)
-    def save_samples_for(sample_name,samples):
-        print('generating {} siamese speech pairs'.format(sample_name))
-        X,Y = create_tagged_data(samples)
-        print('shuffling array speech pairs')
-        rng_state = np.random.get_state()
-        np.random.shuffle(X)
-        np.random.set_state(rng_state)
-        np.random.shuffle(Y)
-        print('pickling X/Y')
-        np.save('outputs/{}-train-X.npy'.format(audio_group), X)
-        np.save('outputs/{}-train-Y.npy'.format(audio_group), Y)
-    save_samples_for('train',tr_audio_samples)
-    save_samples_for('test',te_audio_samples)
-
-def speech_data(audio_group='audio'):
-    X = np.load('outputs/{}-X.npy'.format(audio_group)) / 255.0
-    Y = np.load('outputs/{}-Y.npy'.format(audio_group))
-    return (X,Y)
-
-def speech_model_data():
-    tr_pairs = np.load('outputs/tr_pairs.npy') / 255.0
-    te_pairs = np.load('outputs/te_pairs.npy') / 255.0
-    tr_pairs[tr_pairs < 0] = 0
-    te_pairs[te_pairs < 0] = 0
-    tr_y = np.load('outputs/tr_y.npy')
-    te_y = np.load('outputs/te_y.npy')
-    return tr_pairs, te_pairs, tr_y, te_y
+def convert_old_audio():
+    audio_samples = pd.read_csv( './outputs/audio.csv.old'
+        , names=['word', 'voice', 'rate', 'variant', 'file'])
+    audio_samples['phonemes'] = 'unknown'
+    audio_samples['language'] = 'en-US'
+    audio_samples.loc[audio_samples['variant'] == 'normal','variant'] = 'low'
+    audio_samples.loc[audio_samples['variant'] == 'phoneme','variant'] = 'medium'
+    audio_samples = audio_samples[['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file']]
+    audio_samples.to_csv('./outputs/audio_new.csv',index=False,header=False)
 if __name__ == '__main__':
     # sunflower_pairs_data()
     # create_spectrogram_data()
     # create_spectrogram_data('story_words')
-    create_spectrogram_tfrecords('story_words')
+    # create_spectrogram_tfrecords('story_words')
+    # create_spectrogram_tfrecords('story_words_test')
+    # read_siamese_tfrecords('story_all')
+    # read_siamese_tfrecords('story_words_test')
+    # padd_zeros_siamese_tfrecords('story_words')
+    # fix_csv('story_words')
+    # pickle_constants('story_words')
+    # create_spectrogram_tfrecords('audio',sample_count=100)
+    # create_spectrogram_tfrecords('story_all',sample_count=25)
+    create_spectrogram_tfrecords('story_words',sample_count=10,train_test_ratio=0.2)
+    # create_spectrogram_tfrecords('audio',sample_count=50)
+    # read_siamese_tfrecords_generator('audio')
+    # padd_zeros_siamese_tfrecords('audio')
     # create_padded_spectrogram()
     # create_speech_pairs_data()
     # print(speech_model_data())
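The writer flattens each spectrogram with reshape(-1) and stores its shape alongside the floats, so the reader can only recover the matrix because spec_n*/spec_w* travel with it. A minimal round-trip sketch against the same TF 1.x proto API used above; the random array is illustrative:

import numpy as np
import tensorflow as tf

spec = np.random.rand(7, 5).astype(np.float32)   # 7 windows x 5 features
example = tf.train.Example(features=tf.train.Features(feature={
    'spec1': tf.train.Feature(float_list=tf.train.FloatList(value=spec.reshape(-1))),
    'spec_n1': tf.train.Feature(int64_list=tf.train.Int64List(value=[spec.shape[0]])),
    'spec_w1': tf.train.Feature(int64_list=tf.train.Int64List(value=[spec.shape[1]])),
}))

parsed = tf.train.Example()
parsed.ParseFromString(example.SerializeToString())
n = parsed.features.feature['spec_n1'].int64_list.value[0]
w = parsed.features.feature['spec_w1'].int64_list.value[0]
restored = np.array(parsed.features.feature['spec1'].float_list.value).reshape(n, w)
assert np.allclose(spec, restored, atol=1e-6)    # float32 survives the round trip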

speech_siamese.py

@@ -1,7 +1,8 @@
 from __future__ import absolute_import
 from __future__ import print_function
 import numpy as np
-from speech_data import speech_model_data
+# from speech_data import speech_model_data
+from speech_data import read_siamese_tfrecords_generator
 from keras.models import Model,load_model
 from keras.layers import Input, Dense, Dropout, LSTM, Lambda, Concatenate
 from keras.losses import categorical_crossentropy
@@ -11,41 +12,44 @@ from keras.utils import to_categorical
 from keras.optimizers import RMSprop
 from keras.callbacks import TensorBoard, ModelCheckpoint
 from keras import backend as K
+from speech_utils import create_dir

-def euclidean_distance(vects):
-    x, y = vects
-    return K.sqrt(
-        K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))
-
-def eucl_dist_output_shape(shapes):
-    shape1, shape2 = shapes
-    return (shape1[0], 1)
-
-def contrastive_loss(y_true, y_pred):
-    '''Contrastive loss from Hadsell-et-al.'06
-    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
-    '''
-    return K.mean(y_true * K.square(y_pred) +
-                  (1 - y_true) * K.square(K.maximum(1 - y_pred, 0)))
+# def euclidean_distance(vects):
+#     x, y = vects
+#     return K.sqrt(
+#         K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))
+#
+# def eucl_dist_output_shape(shapes):
+#     shape1, shape2 = shapes
+#     return (shape1[0], 1)
+#
+# def contrastive_loss(y_true, y_pred):
+#     '''Contrastive loss from Hadsell-et-al.'06
+#     http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
+#     '''
+#     return K.mean(y_true * K.square(y_pred) +
+#                   (1 - y_true) * K.square(K.maximum(1 - y_pred, 0)))

 def create_base_rnn_network(input_dim):
     '''Base network to be shared (eq. to feature extraction).
     '''
     inp = Input(shape=input_dim)
-    ls1 = LSTM(256, return_sequences=True)(inp)
+    ls0 = LSTM(512, return_sequences=True)(inp)
+    ls1 = LSTM(256, return_sequences=True)(ls0)
     ls2 = LSTM(128, return_sequences=True)(ls1)
     # ls3 = LSTM(32, return_sequences=True)(ls2)
     ls4 = LSTM(64)(ls2)
-    d1 = Dense(128, activation='relu')(ls4)
-    d2 = Dense(64, activation='relu')(d1)
     return Model(inp, ls4)

 def compute_accuracy(y_true, y_pred):
     '''Compute classification accuracy with a fixed threshold on distances.
     '''
-    pred = y_pred.ravel() < 0.5
+    pred = y_pred.ravel() > 0.5
     return np.mean(pred == y_true)
@@ -56,11 +60,12 @@ def accuracy(y_true, y_pred):
 def dense_classifier(processed):
     conc_proc = Concatenate()(processed)
-    d1 = Dense(16, activation='relu')(conc_proc)
+    d1 = Dense(64, activation='relu')(conc_proc)
     # dr1 = Dropout(0.1)(d1)
-    d2 = Dense(8, activation='relu')(d1)
+    d2 = Dense(128, activation='relu')(d1)
+    d3 = Dense(8, activation='relu')(d2)
     # dr2 = Dropout(0.1)(d2)
-    return Dense(2, activation='softmax')(d2)
+    return Dense(2, activation='softmax')(d3)
 def siamese_model(input_dim):
     # input_dim = (15, 1654)

@@ -78,17 +83,24 @@ def siamese_model(input_dim):
     return model
-def train_siamese():
+def train_siamese(audio_group = 'audio'):
     # the data, shuffled and split between train and test sets
-    tr_pairs, te_pairs, tr_y_e, te_y_e = speech_model_data()
-    tr_y = to_categorical(tr_y_e, num_classes=2)
-    te_y = to_categorical(te_y_e, num_classes=2)
-    input_dim = (tr_pairs.shape[2], tr_pairs.shape[3])
+    # tr_pairs, te_pairs, tr_y_e, te_y_e = speech_model_data()
+    batch_size = 128
+    model_dir = './models/'+audio_group
+    create_dir(model_dir)
+    log_dir = './logs/'+audio_group
+    create_dir(log_dir)
+    tr_gen_fn,te_pairs,te_y,n_step,n_features,n_records = read_siamese_tfrecords_generator(audio_group,batch_size=batch_size)
+    tr_gen = tr_gen_fn()
+    # tr_y = to_categorical(tr_y_e, num_classes=2)
+    # te_y = to_categorical(te_y_e, num_classes=2)
+    input_dim = (n_step, n_features)

     model = siamese_model(input_dim)
     tb_cb = TensorBoard(
-        log_dir='./logs/siamese_logs',
+        log_dir=log_dir,
         histogram_freq=1,
         batch_size=32,
         write_graph=True,
@@ -97,39 +109,45 @@ def train_siamese():
         embeddings_freq=0,
         embeddings_layer_names=None,
         embeddings_metadata=None)
-    cp_file_fmt = './models/siamese_speech_model-{epoch:02d}-epoch-{val_loss:0.2f}\
+    cp_file_fmt = model_dir+'/siamese_speech_model-{epoch:02d}-epoch-{val_loss:0.2f}\
 -acc.h5'
     cp_cb = ModelCheckpoint(
         cp_file_fmt,
         monitor='val_loss',
         verbose=0,
-        save_best_only=False,
-        save_weights_only=False,
+        save_best_only=True,
+        save_weights_only=True,
         mode='auto',
         period=1)
     # train
-    rms = RMSprop(lr=0.001)
+    rms = RMSprop()#lr=0.001
     model.compile(loss=categorical_crossentropy, optimizer=rms, metrics=[accuracy])
-    model.fit(
-        [tr_pairs[:, 0], tr_pairs[:, 1]],
-        tr_y,
-        batch_size=128,
-        epochs=50,
-        validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),
-        callbacks=[tb_cb, cp_cb])
-
-    model.save('./models/siamese_speech_model-final.h5')
+    # model.fit(
+    #     [tr_pairs[:, 0], tr_pairs[:, 1]],
+    #     tr_y,
+    #     batch_size=128,
+    #     epochs=100,
+    #     validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),
+    #     callbacks=[tb_cb, cp_cb])
+    model.fit_generator(tr_gen
+        ,epochs=1000
+        ,steps_per_epoch=n_records//batch_size
+        ,validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y)
+        ,use_multiprocessing=True, workers=1
+        ,callbacks=[tb_cb, cp_cb])
+    model.save(model_dir+'/siamese_speech_model-final.h5')

     # compute final accuracy on training and test sets
-    y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
-    tr_acc = compute_accuracy(tr_y, y_pred)
+    # y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
+    # tr_acc = compute_accuracy(tr_y, y_pred)
+    # print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
     y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
     te_acc = compute_accuracy(te_y, y_pred)
-    print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
     print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))

 if __name__ == '__main__':
-    train_siamese()
+    train_siamese('story_words')
+    # train_siamese('audio')
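The compute_accuracy flip from `< 0.5` to `> 0.5` tracks the change of head: under the commented-out Euclidean-distance output a small value meant "same pair", whereas the two-way softmax emits a probability where a large value does. A small illustration with hypothetical predictions, treating the class-1 score as P(same):

import numpy as np

y_true = np.array([1, 0, 1])            # 1 = matching pair
dist   = np.array([0.2, 1.4, 0.3])      # old head: distances, small means same
proba  = np.array([0.8, 0.1, 0.7])      # new head: softmax P(same), large means same
print(np.mean((dist < 0.5) == y_true))  # old rule
print(np.mean((proba > 0.5) == y_true)) # new rule; both print 1.0 here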

speech_tools.py

@@ -1,15 +1,36 @@
 import pyaudio
+from pysndfile import sndio as snd
 import numpy as np
 # from matplotlib import pyplot as plt
-from spectro_gen import plot_stft, generate_spec_frec
+from speech_spectrum import plot_stft, generate_spec_frec
+
+SAMPLE_RATE = 22050
+N_CHANNELS = 2
+
+def file_player():
+    p_oup = pyaudio.PyAudio()
+    def play_file(audiopath,plot=False):
+        print('playing',audiopath)
+        samples, samplerate, form = snd.read(audiopath)
+        stream = p_oup.open(
+            format=pyaudio.paFloat32,
+            channels=2,
+            rate=samplerate,
+            output=True)
+        one_channel = np.asarray([samples, samples]).T.reshape(-1)
+        audio_data = one_channel.astype(np.float32).tobytes()
+        stream.write(audio_data)
+        stream.close()
+        if plot:
+            plot_stft(samples, SAMPLE_RATE)
+    def close_player():
+        p_oup.terminate()
+    return play_file,close_player

 def record_spectrogram(n_sec, plot=False, playback=False):
-    SAMPLE_RATE = 22050
-    N_CHANNELS = 2
+    # show_record_prompt()
     N_SEC = n_sec
     CHUNKSIZE = int(SAMPLE_RATE * N_SEC / N_CHANNELS)  # fixed chunk size
-    # show_record_prompt()
     input('Press [Enter] to start recording sample... ')
     p_inp = pyaudio.PyAudio()
     stream = p_inp.open(

74 speech_utils.py Normal file

@@ -0,0 +1,74 @@
import os
import threading
import multiprocessing
import pandas as pd
import numpy as np

def _apply_df(args):
    df, func, num, kwargs = args
    return num, df.apply(func, **kwargs)

def apply_by_multiprocessing(df,func,**kwargs):
    cores = multiprocessing.cpu_count()
    workers=kwargs.pop('workers') if 'workers' in kwargs else cores
    pool = multiprocessing.Pool(processes=workers)
    result = pool.map(_apply_df, [(d, func, i, kwargs) for i,d in enumerate(np.array_split(df, workers))])
    pool.close()
    result=sorted(result,key=lambda x:x[0])
    return pd.concat([i[1] for i in result])

def square(x):
    return x**x  # NB: despite the name, this computes x**x, not x*x

if __name__ == '__main__':
    df = pd.DataFrame({'a':range(10), 'b':range(10)})
    apply_by_multiprocessing(df, square, axis=1, workers=4)

def rm_rf(d):
    for path in (os.path.join(d,f) for f in os.listdir(d)):
        if os.path.isdir(path):
            rm_rf(path)
        else:
            os.unlink(path)
    os.rmdir(d)

def create_dir(direc):
    if not os.path.exists(direc):
        os.makedirs(direc)
    else:
        rm_rf(direc)  # NB: wipes any existing contents before recreating
        create_dir(direc)

#################### Now make the data generator threadsafe ####################
class threadsafe_iter:
    """Takes an iterator/generator and makes it thread-safe by
    serializing call to the `next` method of given iterator/generator.
    """
    def __init__(self, it):
        self.it = it
        self.lock = threading.Lock()

    def __iter__(self):
        return self

    def __next__(self):  # Py3
        with self.lock:
            return next(self.it)

    def next(self):  # Py2
        with self.lock:
            return self.it.next()

def threadsafe_generator(f):
    """A decorator that takes a generator function and makes it thread-safe.
    """
    def g(*a, **kw):
        return threadsafe_iter(f(*a, **kw))
    return g
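Keras pulls batches from fit_generator on worker threads, and a plain generator advanced from two threads at once raises "generator already executing" or hands out duplicate batches; the lock serializes every next() call. A minimal sketch of the decorator in use; the counting generator is illustrative:

import threading

@threadsafe_generator
def counter():
    i = 0
    while True:
        i += 1
        yield i

gen = counter()
seen = []
def worker():
    for _ in range(1000):
        seen.append(next(gen))  # safe: next() is serialized by the lock

threads = [threading.Thread(target=worker) for _ in range(4)]
for t in threads: t.start()
for t in threads: t.join()
print(len(seen), len(set(seen)))  # 4000 4000, i.e. no value handed out twice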

View File

@@ -1,13 +1,15 @@
-from speech_siamese import siamese_model
-from record_mic_speech import record_spectrogram
-from importlib import reload
+# from speech_siamese import siamese_model
+from speech_tools import record_spectrogram, file_player
+# from importlib import reload
 # import speech_data
 # reload(speech_data)
-from speech_data import create_test_pair,get_word_pairs_data,speech_data
 import numpy as np
-
-model = siamese_model((15, 1654))
-model.load_weights('./models/siamese_speech_model-final.h5')
+import pandas as pd
+import os
+import pickle
+import tensorflow as tf
+import csv
+from speech_data import padd_zeros

 def predict_recording_with(m,sample_size=15):
     spec1 = record_spectrogram(n_sec=1.4)
@@ -24,7 +26,85 @@ def test_with(audio_group):
     print(np.argmax(model.predict([X[:, 0], X[:, 1]]),axis=1))
     print(Y.astype(np.int8))

-test_with('rand_edu')
+def evaluate_siamese(audio_group='audio',model_file = 'siamese_speech_model-305-epoch-0.20-acc.h5'):
+    # audio_group='audio';model_file = 'siamese_speech_model-305-epoch-0.20-acc.h5'
+    records_file = os.path.join('./outputs',audio_group+'.train.tfrecords')
+    const_file = os.path.join('./outputs',audio_group+'.constants')
+    model_weights_path =os.path.join('./models/story_words/',model_file)
+    (n_spec,n_features,n_records) = pickle.load(open(const_file,'rb'))
+    print('evaluating tfrecords({}-train)...'.format(audio_group))
+    model = siamese_model((n_spec, n_features))
+    model.load_weights(model_weights_path)
+    record_iterator = tf.python_io.tf_record_iterator(path=records_file)
+    #tqdm(enumerate(record_iterator),total=n_records)
+    result_csv = open('./outputs/' + audio_group + '.results.csv','w')
+    result_csv_w = csv.writer(result_csv, quoting=csv.QUOTE_MINIMAL)
+    result_csv_w.writerow(["phoneme1","phoneme2","voice1","voice2","rate1","rate2","variant1","variant2","file1","file2"])
+    for (i,string_record) in enumerate(record_iterator):
+        # string_record = next(record_iterator)
+        example = tf.train.Example()
+        example.ParseFromString(string_record)
+        spec_n1 = example.features.feature['spec_n1'].int64_list.value[0]
+        spec_n2 = example.features.feature['spec_n2'].int64_list.value[0]
+        spec_w1 = example.features.feature['spec_w1'].int64_list.value[0]
+        spec_w2 = example.features.feature['spec_w2'].int64_list.value[0]
+        spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1)
+        spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2)
+        p_spec1,p_spec2 = padd_zeros(spec1,n_spec),padd_zeros(spec2,n_spec)
+        input_arr = np.asarray([[p_spec1,p_spec2]])
+        output_arr = np.asarray([example.features.feature['output'].int64_list.value])
+        y_pred = model.predict([input_arr[:, 0], input_arr[:, 1]])
+        predicted = np.asarray(y_pred[0]>0.5).astype(output_arr.dtype)
+        expected = output_arr[0]
+        if np.all(predicted == expected):
+            continue
+        word = example.features.feature['word'].bytes_list.value[0].decode()
+        phoneme1 = example.features.feature['phoneme1'].bytes_list.value[0].decode()
+        phoneme2 = example.features.feature['phoneme2'].bytes_list.value[0].decode()
+        voice1 = example.features.feature['voice1'].bytes_list.value[0].decode()
+        voice2 = example.features.feature['voice2'].bytes_list.value[0].decode()
+        language = example.features.feature['language'].bytes_list.value[0].decode()
+        rate1 = example.features.feature['rate1'].int64_list.value[0]
+        rate2 = example.features.feature['rate2'].int64_list.value[0]
+        variant1 = example.features.feature['variant1'].bytes_list.value[0].decode()
+        variant2 = example.features.feature['variant2'].bytes_list.value[0].decode()
+        file1 = example.features.feature['file1'].bytes_list.value[0].decode()
+        file2 = example.features.feature['file2'].bytes_list.value[0].decode()
+        print(phoneme1,phoneme2,voice1,voice2,rate1,rate2,variant1,variant2,file1,file2)
+        result_csv_w.writerow([phoneme1,phoneme2,voice1,voice2,rate1,rate2,variant1,variant2,file1,file2])
+    result_csv.close()
+
+def play_results(audio_group='audio'):
+    result_data = pd.read_csv('./outputs/' + audio_group + '.results.csv')
+    play_file,close_player = file_player()
+    quit = False
+    for (i,r) in result_data.iterrows():
+        if quit:
+            break
+        keys = ["phoneme1","phoneme2","voice1","voice2","rate1","rate2","variant1","variant2"]
+        row_vals = [str(r[k]) for k in keys]
+        h_str = '\t'.join(keys)
+        row_str = '\t'.join(row_vals)
+        while True:
+            print(h_str)
+            print(row_str)
+            play_file('./outputs/'+audio_group+'/'+r['file1'],True)
+            play_file('./outputs/'+audio_group+'/'+r['file2'],True)
+            a = input("press 'r/q/[Enter]' to replay/quit/continue:\t")
+            if a == 'r':
+                continue
+            if a == 'q':
+                quit = True
+                break
+            else:
+                break
+    close_player()
+
+# evaluate_siamese('story_words',model_file='siamese_speech_model-305-epoch-0.20-acc.h5')
+play_results('story_words')
+# test_with('rand_edu')
 # sunflower_data,sunflower_result = get_word_pairs_data('sweater',15)
 # print(np.argmax(model.predict([sunflower_data[:, 0], sunflower_data[:, 1]]),axis=1))
 # print(sunflower_result)

View File

@@ -3,6 +3,7 @@ from AppKit import NSSpeechSynthesizer, NSSpeechInputModeProperty
 from AppKit import NSSpeechModePhoneme
 from Foundation import NSURL
 import json
+import csv
 import random
 import os
 import re
@@ -81,6 +82,11 @@ class SynthFile(object):
         return ','.join([str(c) for c in cols])+'\n'

+    def get_values(self):
+        cols = [self.word, self.phoneme, self.voice,
+                self.voice_lang, self.rate, self.variant,
+                self.filename]
+        return [str(c) for c in cols]

 class SynthVariant(object):
     """docstring for SynthVariant."""
@@ -191,22 +197,11 @@ def synth_generator():
     print("It took {} to synthsize all variants.".format(time_str))
     return synth_for_words

-def write_synths(synth_list, fname, csv=False):
-    f = open(fname, 'w')
-    if csv:
-        for s in synth_list:
-            f.write(s.get_csv())
-    else:
-        json.dump([s.get_json() for s in synth_list], f)
-    f.close()
-
 def synth_logger(fname, csv=False):
     f = open(fname, 'w')
+    s_csv_w = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
     def csv_writer(s):
-        f.write(s.get_csv())
+        s_csv_w.writerow(s.get_values())
     synth_list = []
     def json_writer(s):
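The point of the csv.writer switch (commit 55e2de2f04) is quoting: phonemes and phrases can themselves contain commas, which the old ','.join approach silently turned into extra columns. A small sketch of the difference, with made-up row values:

import csv, io

row = ['sunflower', 'sVn, flaU', 'Alex']          # phoneme field contains a comma
print(','.join(row))                              # 4 columns, mis-aligned
buf = io.StringIO()
csv.writer(buf, quoting=csv.QUOTE_MINIMAL).writerow(row)
print(buf.getvalue().strip())                     # comma field is quoted, 3 columns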