Compare commits
14 Commits
5b682c78b8
...
e4b8b4e0a7
| Author | SHA1 | Date |
|---|---|---|
|
|
e4b8b4e0a7 | |
|
|
988f66c2c2 | |
|
|
d978272bdb | |
|
|
bb72c4045e | |
|
|
1190312def | |
|
|
e9b18921ee | |
|
|
ab452494b3 | |
|
|
0a4d4fadeb | |
|
|
b3a6aa2f6a | |
|
|
7cbfebbf1a | |
|
|
b8a9f87031 | |
|
|
41b3f1a9fe | |
|
|
55e2de2f04 | |
|
|
15f29895d4 |
|
|
@ -1,25 +0,0 @@
|
||||||
import multiprocessing
|
|
||||||
import pandas as pd
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _apply_df(args):
|
|
||||||
df, func, num, kwargs = args
|
|
||||||
return num, df.apply(func, **kwargs)
|
|
||||||
|
|
||||||
def apply_by_multiprocessing(df,func,**kwargs):
|
|
||||||
cores = multiprocessing.cpu_count()
|
|
||||||
workers=kwargs.pop('workers') if 'workers' in kwargs else cores
|
|
||||||
pool = multiprocessing.Pool(processes=workers)
|
|
||||||
result = pool.map(_apply_df, [(d, func, i, kwargs) for i,d in enumerate(np.array_split(df, workers))])
|
|
||||||
pool.close()
|
|
||||||
result=sorted(result,key=lambda x:x[0])
|
|
||||||
return pd.concat([i[1] for i in result])
|
|
||||||
|
|
||||||
def square(x):
    """Return ``x`` squared.

    Works on anything that supports ``**`` (numbers, NumPy arrays, pandas
    Series). The previous body computed ``x ** x`` instead of the square.
    """
    return x ** 2
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
df = pd.DataFrame({'a':range(10), 'b':range(10)})
|
|
||||||
apply_by_multiprocessing(df, square, axis=1, workers=4)
|
|
||||||
|
|
@ -0,0 +1,76 @@
|
||||||
|
bleach==1.5.0
|
||||||
|
click==6.7
|
||||||
|
cloudpickle==0.4.1
|
||||||
|
cycler==0.10.0
|
||||||
|
dask==0.15.4
|
||||||
|
decorator==4.1.2
|
||||||
|
distributed==1.19.3
|
||||||
|
entrypoints==0.2.3
|
||||||
|
enum34==1.1.6
|
||||||
|
futures==3.1.1
|
||||||
|
h5py==2.7.1
|
||||||
|
HeapDict==1.0.0
|
||||||
|
html5lib==0.9999999
|
||||||
|
ipykernel==4.6.1
|
||||||
|
ipython==6.2.1
|
||||||
|
ipython-genutils==0.2.0
|
||||||
|
ipywidgets==7.0.3
|
||||||
|
jedi==0.11.0
|
||||||
|
Jinja2==2.9.6
|
||||||
|
jsonschema==2.6.0
|
||||||
|
jupyter==1.0.0
|
||||||
|
jupyter-client==5.1.0
|
||||||
|
jupyter-console==5.2.0
|
||||||
|
jupyter-core==4.3.0
|
||||||
|
Keras==2.0.8
|
||||||
|
locket==0.2.0
|
||||||
|
Markdown==2.6.9
|
||||||
|
MarkupSafe==1.0
|
||||||
|
matplotlib==2.1.0
|
||||||
|
mistune==0.7.4
|
||||||
|
msgpack-python==0.4.8
|
||||||
|
nbconvert==5.3.1
|
||||||
|
nbformat==4.4.0
|
||||||
|
notebook==5.2.0
|
||||||
|
numexpr==2.6.4
|
||||||
|
numpy==1.13.3
|
||||||
|
pandas==0.20.3
|
||||||
|
pandocfilters==1.4.2
|
||||||
|
parso==0.1.0
|
||||||
|
partd==0.3.8
|
||||||
|
pexpect==4.2.1
|
||||||
|
pickleshare==0.7.4
|
||||||
|
progressbar2==3.34.3
|
||||||
|
prompt-toolkit==1.0.15
|
||||||
|
protobuf==3.4.0
|
||||||
|
psutil==5.4.0
|
||||||
|
ptyprocess==0.5.2
|
||||||
|
PyAudio==0.2.11
|
||||||
|
Pygments==2.2.0
|
||||||
|
pyparsing==2.2.0
|
||||||
|
pysndfile==1.0.0
|
||||||
|
python-dateutil==2.6.1
|
||||||
|
python-utils==2.2.0
|
||||||
|
pytz==2017.2
|
||||||
|
PyYAML==3.12
|
||||||
|
pyzmq==16.0.2
|
||||||
|
qtconsole==4.3.1
|
||||||
|
scikit-learn==0.19.0
|
||||||
|
scipy==0.19.1
|
||||||
|
simplegeneric==0.8.1
|
||||||
|
six==1.11.0
|
||||||
|
sortedcontainers==1.5.7
|
||||||
|
tables==3.4.2
|
||||||
|
tblib==1.3.2
|
||||||
|
tensorflow==1.3.0
|
||||||
|
tensorflow-tensorboard==0.4.0rc1
|
||||||
|
terminado==0.6
|
||||||
|
testpath==0.3.1
|
||||||
|
toolz==0.8.2
|
||||||
|
tornado==4.5.2
|
||||||
|
tqdm==4.19.4
|
||||||
|
traitlets==4.3.2
|
||||||
|
wcwidth==0.1.7
|
||||||
|
Werkzeug==0.12.2
|
||||||
|
widgetsnbextension==3.0.6
|
||||||
|
zict==0.1.3
|
||||||
380
speech_data.py
380
speech_data.py
|
|
@ -1,220 +1,246 @@
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pandas_parallel import apply_by_multiprocessing
|
from speech_utils import apply_by_multiprocessing
|
||||||
|
from speech_utils import threadsafe_iter
|
||||||
# import dask as dd
|
# import dask as dd
|
||||||
# import dask.dataframe as ddf
|
# import dask.dataframe as ddf
|
||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
|
from tensorflow.python.ops import data_flow_ops
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from spectro_gen import generate_aiff_spectrogram
|
from speech_spectrum import generate_aiff_spectrogram
|
||||||
from sklearn.model_selection import train_test_split
|
from sklearn.model_selection import train_test_split
|
||||||
import itertools
|
import itertools
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
import csv
|
import csv
|
||||||
import gc
|
import gc
|
||||||
|
import pickle
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
def get_siamese_pairs(groupF1, groupF2, max_pairs=10):
    """Build shuffled "same" and "different" row pairs from two DataFrames.

    Parameters
    ----------
    groupF1, groupF2 : pandas.DataFrame
        The two groups of rows to pair up.
    max_pairs : int, optional
        Upper bound on the number of pairs returned in each list
        (default 10, the value that used to be hard-coded).

    Returns
    -------
    (same, diff) : tuple of lists
        ``same`` holds 2-combinations of rows taken within ``groupF1`` and
        within ``groupF2``; ``diff`` holds ``(row_from_F1, row_from_F2)``
        pairs. Both lists are shuffled with ``random.shuffle`` and then
        truncated to ``max_pairs`` entries.
    """
    rows1 = [row for _, row in groupF1.iterrows()]
    rows2 = [row for _, row in groupF2.iterrows()]
    diff = [(r1, r2) for r2 in rows2 for r1 in rows1]
    same = (list(itertools.combinations(rows1, 2))
            + list(itertools.combinations(rows2, 2)))
    random.shuffle(same)
    random.shuffle(diff)
    return same[:max_pairs], diff[:max_pairs]
|
|
||||||
|
|
||||||
def siamese_pairs(rightGroup, wrongGroup):
|
def siamese_pairs(rightGroup, wrongGroup):
|
||||||
group1 = [r for (i, r) in rightGroup.iterrows()]
|
group1 = [r for (i, r) in rightGroup.iterrows()]
|
||||||
group2 = [r for (i, r) in wrongGroup.iterrows()]
|
group2 = [r for (i, r) in wrongGroup.iterrows()]
|
||||||
rightWrongPairs = [(g1, g2) for g2 in group2 for g1 in group1]
|
rightWrongPairs = [(g1, g2) for g2 in group2 for g1 in group1]
|
||||||
rightRightPairs = [i for i in itertools.combinations(group1, 2)]
|
rightRightPairs = [i for i in itertools.combinations(group1, 2)]#+[i for i in itertools.combinations(group2, 2)]
|
||||||
random.shuffle(rightWrongPairs)
|
# random.shuffle(rightWrongPairs)
|
||||||
random.shuffle(rightRightPairs)
|
# random.shuffle(rightRightPairs)
|
||||||
# return (random.sample(same,10), random.sample(diff,10))
|
# return rightRightPairs[:10],rightWrongPairs[:10]
|
||||||
return rightRightPairs[:10],rightWrongPairs[:10]
|
return rightRightPairs[:32],rightWrongPairs[:32]
|
||||||
|
|
||||||
def append_zeros(spgr, max_samples):
|
|
||||||
return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
|
def _float_feature(value):
|
||||||
'median')
|
return tf.train.Feature(float_list=tf.train.FloatList(value=value))
|
||||||
|
|
||||||
|
def _int64_feature(value):
|
||||||
|
return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
|
||||||
|
|
||||||
|
def _bytes_feature(value):
|
||||||
|
return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))
|
||||||
|
|
||||||
|
def create_spectrogram_tfrecords(audio_group='audio',sample_count=0,train_test_ratio=0.1):
    '''Write train/test TFRecord files of spectrogram pairs for ``audio_group``.

    Reads ``./outputs/<audio_group>.csv``, groups the rows by ``word``,
    optionally keeps a random subset of ``sample_count`` words, splits the
    word groups into train and test with ``train_test_split`` and writes one
    ``tf.train.Example`` per accepted pair to
    ``./outputs/<audio_group>.<train|test>.tfrecords``. Finally pickles
    ``(n_spec, n_features, n_records)`` to ``./outputs/<audio_group>.constants``.

    Parameters:
        audio_group: base name of the CSV file and of the audio directory.
        sample_count: number of word groups to sample; values <= 0 keep all.
        train_test_ratio: passed as ``test_size`` to ``train_test_split``.

    References:
    http://warmspringwinds.github.io/tensorflow/tf-slim/2016/12/21/tfrecords-guide/
    http://www.machinelearninguru.com/deep_learning/tensorflow/basics/tfrecord/tfrecord.html
    '''
    audio_samples = pd.read_csv('./outputs/' + audio_group + '.csv', index_col=0)
    audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(
        lambda x: 'outputs/' + audio_group + '/' + x)
    # Running totals shared by both write_samples() calls (train and test).
    n_records, n_spec, n_features = 0, 0, 0

    def write_samples(wg, sample_name):
        '''Write every accepted pair of the word groups ``wg`` to one file.'''
        nonlocal n_spec, n_records, n_features
        word_group_prog = tqdm(wg, desc='Computing spectrogram')
        record_file = './outputs/{}.{}.tfrecords'.format(audio_group, sample_name)
        writer = tf.python_io.TFRecordWriter(record_file)
        try:
            for (w, word_group) in word_group_prog:
                word_group_prog.set_postfix(word=w, sample_name=sample_name)
                g = word_group.reset_index()
                g['spectrogram'] = apply_by_multiprocessing(g['file_path'], generate_aiff_spectrogram)
                sample_right = g.loc[g['variant'] == 'low']
                sample_wrong = g.loc[g['variant'] == 'medium']
                same, diff = siamese_pairs(sample_right, sample_wrong)
                # Label [0,1] goes with the "same" pairs, [1,0] with the "diff" pairs.
                groups = [([0, 1], same), ([1, 0], diff)]
                for (output, group) in groups:
                    group_prog = tqdm(group, desc='Writing Spectrogram')
                    for sample1, sample2 in group_prog:
                        same_variant = sample1['variant'] == sample2['variant']
                        phon_same = sample1['phonemes'] == sample2['phonemes']
                        voice_diff = sample1['voice'] != sample2['voice']
                        # Skip pairs of different variants that share phonemes.
                        if not same_variant and phon_same:
                            continue
                        # Skip same-variant pairs spoken by the same voice.
                        if same_variant and not voice_diff:
                            continue
                        group_prog.set_postfix(output=output
                                               , var1=sample1['variant']
                                               , var2=sample2['variant'])
                        spectro1, spectro2 = sample1['spectrogram'], sample2['spectrogram']
                        spec_n1, spec_n2 = spectro1.shape[0], spectro2.shape[0]
                        spec_w1, spec_w2 = spectro1.shape[1], spectro2.shape[1]
                        # Stored flattened; spec_n*/spec_w* allow reshaping on read.
                        spec1, spec2 = spectro1.reshape(-1), spectro2.reshape(-1)
                        n_spec = max([n_spec, spec_n1, spec_n2])
                        n_features = spec_w1
                        n_records += 1
                        example = tf.train.Example(features=tf.train.Features(
                            feature={
                                'word': _bytes_feature([w.encode('utf-8')]),
                                'phoneme1': _bytes_feature([sample1['phonemes'].encode('utf-8')]),
                                'phoneme2': _bytes_feature([sample2['phonemes'].encode('utf-8')]),
                                'voice1': _bytes_feature([sample1['voice'].encode('utf-8')]),
                                'voice2': _bytes_feature([sample2['voice'].encode('utf-8')]),
                                'language': _bytes_feature([sample1['language'].encode('utf-8')]),
                                'rate1': _int64_feature([sample1['rate']]),
                                'rate2': _int64_feature([sample2['rate']]),
                                'variant1': _bytes_feature([sample1['variant'].encode('utf-8')]),
                                'variant2': _bytes_feature([sample2['variant'].encode('utf-8')]),
                                'file1': _bytes_feature([sample1['file'].encode('utf-8')]),
                                'file2': _bytes_feature([sample2['file'].encode('utf-8')]),
                                'spec1': _float_feature(spec1),
                                'spec2': _float_feature(spec2),
                                'spec_n1': _int64_feature([spec_n1]),
                                'spec_w1': _int64_feature([spec_w1]),
                                'spec_n2': _int64_feature([spec_n2]),
                                'spec_w2': _int64_feature([spec_w2]),
                                'output': _int64_feature(output)
                            }
                        ))
                        writer.write(example.SerializeToString())
                    group_prog.close()
        finally:
            # Close the progress bar and the record file even if a group fails.
            word_group_prog.close()
            writer.close()

    word_groups = list(audio_samples.groupby('word'))
    wg_sampled = reservoir_sample(word_groups, sample_count) if sample_count > 0 else word_groups
    tr_audio_samples, te_audio_samples = train_test_split(wg_sampled, test_size=train_test_ratio)
    write_samples(tr_audio_samples, 'train')
    write_samples(te_audio_samples, 'test')
    const_file = os.path.join('./outputs', audio_group + '.constants')
    # n_records counts the records of BOTH files (train + test).
    with open(const_file, 'wb') as const_fh:
        pickle.dump((n_spec, n_features, n_records), const_fh)
|
||||||
|
|
||||||
def padd_zeros(spgr, max_samples):
    """Zero-pad a 2-D array along axis 0 so that it has ``max_samples`` rows.

    Parameters:
        spgr: 2-D NumPy array (rows are padded, columns are left untouched).
        max_samples: number of rows the result must have.

    Returns:
        A new array of shape ``(max_samples, spgr.shape[1])`` whose extra
        trailing rows are filled with zeros.

    Raises:
        ValueError: if ``spgr`` already has more than ``max_samples`` rows.
    """
    missing = max_samples - spgr.shape[0]
    if missing < 0:
        raise ValueError(
            'spectrogram has {} rows, more than max_samples={}'.format(
                spgr.shape[0], max_samples))
    # np.pad is the public entry point for what was spelled np.lib.pad here.
    return np.pad(spgr, [(0, missing), (0, 0)], 'constant')
|
||||||
|
|
||||||
def to_onehot(a,class_count=2):
|
def reservoir_sample(iterable, k):
|
||||||
# >>> a = np.array([1, 0, 3])
|
it = iter(iterable)
|
||||||
a_row_n = a.shape[0]
|
if not (k > 0):
|
||||||
b = np.zeros((a_row_n, class_count))
|
raise ValueError("sample size must be positive")
|
||||||
b[np.arange(a_row_n), a] = 1
|
|
||||||
return b
|
|
||||||
|
|
||||||
def create_pair(l, r, max_samples):
|
sample = list(itertools.islice(it, k)) # fill the reservoir
|
||||||
l_sample = padd_zeros(l, max_samples)
|
random.shuffle(sample) # if number of items less then *k* then
|
||||||
r_sample = padd_zeros(r, max_samples)
|
# return all items in random order.
|
||||||
return np.asarray([l_sample, r_sample])
|
for i, item in enumerate(it, start=k+1):
|
||||||
|
j = random.randrange(i) # random [0..i)
|
||||||
|
if j < k:
|
||||||
|
sample[j] = item # replace item with gradually decreasing probability
|
||||||
|
return sample
|
||||||
|
|
||||||
|
|
||||||
def create_test_pair(l, r, max_samples):
    """Pad both spectrograms with ``append_zeros`` and wrap them as a batch of one pair."""
    l_sample = append_zeros(l, max_samples)
    r_sample = append_zeros(r, max_samples)
    return np.asarray([[l_sample, r_sample]])


def create_X(sp, max_samples):
    """Build a padded pair from the ``'spectrogram'`` entries of ``sp[0]`` and ``sp[1]``."""
    return create_pair(sp[0]['spectrogram'], sp[1]['spectrogram'], max_samples)


def _parse_siamese_record(string_record, n_spec):
    """Decode one serialized ``tf.train.Example`` into ``(pair, output)``.

    ``pair`` stacks the two spectrograms, each reshaped to its stored
    ``(spec_n, spec_w)`` and zero-padded to ``n_spec`` rows; ``output`` is the
    stored ``'output'`` int64 list as an array.
    """
    example = tf.train.Example()
    example.ParseFromString(string_record)
    feature = example.features.feature
    spec_n1 = feature['spec_n1'].int64_list.value[0]
    spec_n2 = feature['spec_n2'].int64_list.value[0]
    spec_w1 = feature['spec_w1'].int64_list.value[0]
    spec_w2 = feature['spec_w2'].int64_list.value[0]
    spec1 = np.array(feature['spec1'].float_list.value).reshape(spec_n1, spec_w1)
    spec2 = np.array(feature['spec2'].float_list.value).reshape(spec_n2, spec_w2)
    pair = np.asarray([padd_zeros(spec1, n_spec), padd_zeros(spec2, n_spec)])
    output = np.asarray(feature['output'].int64_list.value)
    return pair, output


def read_siamese_tfrecords_generator(audio_group='audio',batch_size=32,test_size=0):
    """Return a training batch generator plus an in-memory test sample.

    Parameters:
        audio_group: base name of the files under ``./outputs``
            (``.train.tfrecords``, ``.test.tfrecords``, ``.constants``).
        batch_size: number of pairs per yielded training batch.
        test_size: number of test records to load; values <= 0 load all.

    Returns:
        ``(record_generator, input_data, output_data, n_spec, n_features,
        n_records)`` where ``record_generator`` is a function returning an
        endless generator of ``([left_batch, right_batch], output_batch)``,
        ``input_data`` has shape ``(test_size, 2, n_spec, n_features)`` and
        ``output_data`` has shape ``(test_size, 2)``.
    """
    records_file = os.path.join('./outputs', audio_group + '.train.tfrecords')
    const_file = os.path.join('./outputs', audio_group + '.constants')
    # NOTE(review): pickle.load must only be used on trusted files; this one
    # is expected to be produced locally by create_spectrogram_tfrecords.
    with open(const_file, 'rb') as const_fh:
        (n_spec, n_features, n_records) = pickle.load(const_fh)
    print('reading tfrecords({}-train)...'.format(audio_group))

    # @threadsafe_iter
    def record_generator():
        input_data = []
        output_data = []
        while True:
            # Re-open the file on every pass so the generator never ends; a
            # partially filled batch carries over into the next pass.
            record_iterator = tf.python_io.tf_record_iterator(path=records_file)
            for string_record in record_iterator:
                pair, output = _parse_siamese_record(string_record, n_spec)
                input_data.append(pair)
                output_data.append(output)
                if len(input_data) == batch_size:
                    input_arr = np.asarray(input_data)
                    output_arr = np.asarray(output_data)
                    yield ([input_arr[:, 0], input_arr[:, 1]], output_arr)
                    input_data = []
                    output_data = []

    # Read test in one-shot. The test file is used here; the previous code
    # opened the training file again, so "test" data was training data.
    te_records_file = os.path.join('./outputs', audio_group + '.test.tfrecords')
    te_n_records = sum(
        1 for _ in tf.python_io.tf_record_iterator(path=te_records_file))
    print('reading tfrecords({}-test)...'.format(audio_group))
    test_size = min([test_size, te_n_records]) if test_size > 0 else te_n_records
    input_data = np.zeros((test_size, 2, n_spec, n_features))
    output_data = np.zeros((test_size, 2))
    if test_size > 0:
        # reservoir_sample rejects k == 0, so an empty test split is skipped.
        te_re_iterator = tf.python_io.tf_record_iterator(path=te_records_file)
        random_samples = enumerate(reservoir_sample(te_re_iterator, test_size))
        for (i, string_record) in tqdm(random_samples, total=test_size):
            pair, output = _parse_siamese_record(string_record, n_spec)
            input_data[i] = pair
            output_data[i] = output

    return record_generator, input_data, output_data, n_spec, n_features, n_records
|
||||||
|
|
||||||
# def get_word_pairs_data(word, max_samples):
|
def audio_samples_word_count(audio_group='audio'):
|
||||||
# audio_samples = pd.read_csv(
|
audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv')
|
||||||
# './outputs/audio.csv',
|
return len(audio_samples.groupby(audio_samples['word']))
|
||||||
# names=['word', 'voice', 'rate', 'variant', 'file'])
|
|
||||||
# audio_samples = audio_samples.loc[audio_samples['word'] ==
|
|
||||||
# word].reset_index(drop=True)
|
|
||||||
# audio_samples.loc[:, 'spectrogram'] = audio_samples.loc[:, 'file'].apply(
|
|
||||||
# lambda x: 'outputs/audio/' + x).apply(generate_aiff_spectrogram)
|
|
||||||
# max_samples = audio_samples['spectrogram'].apply(
|
|
||||||
# lambda x: x.shape[0]).max()
|
|
||||||
# same_data, diff_data = [], []
|
|
||||||
# for (w, g) in audio_samples.groupby(audio_samples['word']):
|
|
||||||
# sample_norm = g.loc[audio_samples['variant'] == 'normal']
|
|
||||||
# sample_phon = g.loc[audio_samples['variant'] == 'phoneme']
|
|
||||||
# same, diff = get_siamese_pairs(sample_norm, sample_phon)
|
|
||||||
# same_data.extend([create_X(s, max_samples) for s in same])
|
|
||||||
# diff_data.extend([create_X(d, max_samples) for d in diff])
|
|
||||||
# Y = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))])
|
|
||||||
# X = np.asarray(same_data + diff_data)
|
|
||||||
# # tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1)
|
|
||||||
# return (X, Y)
|
|
||||||
|
|
||||||
|
def fix_csv(audio_group='audio'):
|
||||||
def create_spectrogram_data(audio_group='audio'):
|
audio_csv_lines = open('./outputs/' + audio_group + '.csv.orig','r').readlines()
|
||||||
|
audio_csv_data = [i.strip().split(',') for i in audio_csv_lines]
|
||||||
|
proper_rows = [i for i in audio_csv_data if len(i) == 7]
|
||||||
|
with open('./outputs/' + audio_group + '.csv','w') as fixed_csv:
|
||||||
|
fixed_csv_w = csv.writer(fixed_csv, quoting=csv.QUOTE_MINIMAL)
|
||||||
|
fixed_csv_w.writerows(proper_rows)
|
||||||
audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv'
|
audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv'
|
||||||
, names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file']
|
, names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file'])
|
||||||
, quoting=csv.QUOTE_NONE)
|
|
||||||
# audio_samples = audio_samples.loc[audio_samples['word'] ==
|
|
||||||
# 'sunflowers'].reset_index(drop=True)
|
|
||||||
audio_samples['file_paths'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x)
|
|
||||||
audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_paths'], os.path.exists)
|
|
||||||
audio_samples = audio_samples[audio_samples['file_exists'] == True].reset_index()
|
|
||||||
audio_samples['spectrogram'] = apply_by_multiprocessing(audio_samples['file_paths'],generate_aiff_spectrogram)#.apply(
|
|
||||||
audio_samples['window_count'] = audio_samples.loc[:,'spectrogram'].apply(lambda x: x.shape[0])
|
|
||||||
audio_samples.to_pickle('outputs/{}-spectrogram.pkl'.format(audio_group))
|
|
||||||
|
|
||||||
def create_spectrogram_tfrecords(audio_group='audio'):
|
|
||||||
audio_samples = pd.read_csv( './outputs/' + audio_group + '.csv'
|
|
||||||
, names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file']
|
|
||||||
, quoting=csv.QUOTE_NONE)
|
|
||||||
# audio_samples = audio_samples.loc[audio_samples['word'] ==
|
|
||||||
# 'sunflowers'].reset_index(drop=True)
|
|
||||||
audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x)
|
audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x)
|
||||||
audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_path'], os.path.exists)
|
audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_path'], os.path.exists)
|
||||||
audio_samples = audio_samples[audio_samples['file_exists'] == True].reset_index()
|
audio_samples = audio_samples[audio_samples['file_exists'] == True]
|
||||||
|
audio_samples = audio_samples.drop(['file_path','file_exists'],axis=1).reset_index(drop=True)
|
||||||
def _float_feature(value):
|
audio_samples.to_csv('./outputs/' + audio_group + '.csv')
|
||||||
return tf.train.Feature(float_list=tf.train.FloatList(value=value))
|
|
||||||
|
|
||||||
def _int64_feature(value):
|
|
||||||
return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
|
|
||||||
|
|
||||||
def _bytes_feature(value):
|
|
||||||
return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))
|
|
||||||
|
|
||||||
writer = tf.python_io.TFRecordWriter('./outputs/' + audio_group + '.tfrecords')
|
|
||||||
# audio_samples = audio_samples[:100]
|
|
||||||
for (w, word_group) in audio_samples.groupby(audio_samples['word']):
|
|
||||||
g = word_group.reset_index()
|
|
||||||
g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram)
|
|
||||||
sample_right = g.loc[audio_samples['variant'] == 'low']
|
|
||||||
sample_wrong = g.loc[audio_samples['variant'] == 'medium']
|
|
||||||
same, diff = siamese_pairs(sample_right, sample_wrong)
|
|
||||||
groups = [([0,1],same),([1,0],diff)]
|
|
||||||
for (output,group) in groups:
|
|
||||||
for sample1,sample2 in group:
|
|
||||||
spectro1,spectro2 = sample1['spectrogram'],sample2['spectrogram']
|
|
||||||
spec_n1,spec_n2 = spectro1.shape[0],spectro2.shape[0]
|
|
||||||
spec_w1,spec_w2 = spectro1.shape[1],spectro2.shape[1]
|
|
||||||
spec1,spec2 = spectro1.reshape(-1),spectro2.reshape(-1)
|
|
||||||
example = tf.train.Example(features=tf.train.Features(
|
|
||||||
feature={
|
|
||||||
'word': _bytes_feature([w.encode('utf-8')]),
|
|
||||||
'phoneme1': _bytes_feature([sample1['phonemes'].encode('utf-8')]),
|
|
||||||
'phoneme2': _bytes_feature([sample2['phonemes'].encode('utf-8')]),
|
|
||||||
'voice1': _bytes_feature([sample1['voice'].encode('utf-8')]),
|
|
||||||
'voice2': _bytes_feature([sample2['voice'].encode('utf-8')]),
|
|
||||||
'language': _bytes_feature([sample1['language'].encode('utf-8')]),
|
|
||||||
'rate1':_int64_feature([sample1['rate']]),
|
|
||||||
'rate2':_int64_feature([sample2['rate']]),
|
|
||||||
'variant1': _bytes_feature([sample1['variant'].encode('utf-8')]),
|
|
||||||
'variant2': _bytes_feature([sample2['variant'].encode('utf-8')]),
|
|
||||||
'file1': _bytes_feature([sample1['file'].encode('utf-8')]),
|
|
||||||
'file2': _bytes_feature([sample2['file'].encode('utf-8')]),
|
|
||||||
'spec1':_float_feature(spec1),
|
|
||||||
'spec2':_float_feature(spec2),
|
|
||||||
'spec_n1':_int64_feature([spec_n1]),
|
|
||||||
'spec_w1':_int64_feature([spec_w1]),
|
|
||||||
'spec_n2':_int64_feature([spec_n2]),
|
|
||||||
'spec_w2':_int64_feature([spec_w2]),
|
|
||||||
'output':_int64_feature(output)
|
|
||||||
}
|
|
||||||
))
|
|
||||||
writer.write(example.SerializeToString())
|
|
||||||
writer.close()
|
|
||||||
|
|
||||||
def create_tagged_data(audio_samples, max_samples=None):
    """Build padded spectrogram pairs ``X`` and one-hot labels ``Y``.

    Parameters:
        audio_samples: DataFrame with at least the columns ``word``,
            ``variant`` and ``spectrogram``.
        max_samples: number of rows every spectrogram is padded to. When
            omitted it is the largest ``shape[0]`` found in the
            ``spectrogram`` column.

    Returns:
        ``(X, Y)`` where ``X`` is the array of all "same" pairs followed by
        all "diff" pairs, and ``Y`` is ``to_onehot`` of ones for the "same"
        pairs and zeros for the "diff" pairs.
    """
    if max_samples is None:
        # create_X requires the padding length; it was not passed before,
        # which made every call fail with a TypeError.
        max_samples = audio_samples['spectrogram'].apply(
            lambda spec: spec.shape[0]).max()
    same_data, diff_data = [], []
    for (w, g) in audio_samples.groupby(audio_samples['word']):
        # NOTE: convert_old_audio renames these variants to 'low'/'medium';
        # this function still expects the old 'normal'/'phoneme' names.
        sample_norm = g.loc[g['variant'] == 'normal']
        sample_phon = g.loc[g['variant'] == 'phoneme']
        same, diff = get_siamese_pairs(sample_norm, sample_phon)
        same_data.extend([create_X(s, max_samples) for s in same])
        diff_data.extend([create_X(d, max_samples) for d in diff])
    print('creating all speech pairs')
    Y_f = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))])
    Y = to_onehot(Y_f.astype(np.int8))
    print('casting as array speech pairs')
    X = np.asarray(same_data + diff_data)
    return X, Y
|
|
||||||
|
|
||||||
def create_speech_pairs_data(audio_group='audio'):
    """Create and save shuffled train/test pair arrays for ``audio_group``.

    Loads ``outputs/<audio_group>-spectrogram.pkl``, splits it 90/10 with
    ``train_test_split`` and, for each split, saves
    ``outputs/<audio_group>-<split>-X.npy`` and ``...-Y.npy``.
    """
    audio_samples = pd.read_pickle('outputs/{}-spectrogram.pkl'.format(audio_group))
    # sample_size = audio_samples['spectrogram'][0].shape[1]
    tr_audio_samples, te_audio_samples = train_test_split(audio_samples, test_size=0.1)

    def save_samples_for(sample_name, samples):
        """Build, shuffle and save the X/Y arrays of one split."""
        print('generating {} siamese speech pairs'.format(sample_name))
        X, Y = create_tagged_data(samples)
        print('shuffling array speech pairs')
        # Restoring the RNG state makes both shuffles use the same
        # permutation, so X and Y stay aligned.
        rng_state = np.random.get_state()
        np.random.shuffle(X)
        np.random.set_state(rng_state)
        np.random.shuffle(Y)
        print('pickling X/Y')
        # The split name is part of the file name; it used to be hard-coded
        # to 'train', so the test split overwrote the training arrays.
        np.save('outputs/{}-{}-X.npy'.format(audio_group, sample_name), X)
        np.save('outputs/{}-{}-Y.npy'.format(audio_group, sample_name), Y)

    save_samples_for('train', tr_audio_samples)
    save_samples_for('test', te_audio_samples)
|
|
||||||
|
|
||||||
def speech_data(audio_group='audio'):
|
|
||||||
X = np.load('outputs/{}-X.npy'.format(audio_group)) / 255.0
|
|
||||||
Y = np.load('outputs/{}-Y.npy'.format(audio_group))
|
|
||||||
return (X,Y)
|
|
||||||
|
|
||||||
def speech_model_data():
|
|
||||||
tr_pairs = np.load('outputs/tr_pairs.npy') / 255.0
|
|
||||||
te_pairs = np.load('outputs/te_pairs.npy') / 255.0
|
|
||||||
tr_pairs[tr_pairs < 0] = 0
|
|
||||||
te_pairs[te_pairs < 0] = 0
|
|
||||||
tr_y = np.load('outputs/tr_y.npy')
|
|
||||||
te_y = np.load('outputs/te_y.npy')
|
|
||||||
return tr_pairs, te_pairs, tr_y, te_y
|
|
||||||
|
|
||||||
|
def convert_old_audio():
|
||||||
|
audio_samples = pd.read_csv( './outputs/audio.csv.old'
|
||||||
|
, names=['word', 'voice', 'rate', 'variant', 'file'])
|
||||||
|
audio_samples['phonemes'] = 'unknown'
|
||||||
|
audio_samples['language'] = 'en-US'
|
||||||
|
audio_samples.loc[audio_samples['variant'] == 'normal','variant'] = 'low'
|
||||||
|
audio_samples.loc[audio_samples['variant'] == 'phoneme','variant'] = 'medium'
|
||||||
|
audio_samples = audio_samples[['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file']]
|
||||||
|
audio_samples.to_csv('./outputs/audio_new.csv',index=False,header=False)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# sunflower_pairs_data()
|
# sunflower_pairs_data()
|
||||||
# create_spectrogram_data()
|
# create_spectrogram_data()
|
||||||
# create_spectrogram_data('story_words')
|
# create_spectrogram_data('story_words')
|
||||||
create_spectrogram_tfrecords('story_words')
|
# create_spectrogram_tfrecords('story_words')
|
||||||
|
# create_spectrogram_tfrecords('story_words_test')
|
||||||
|
# read_siamese_tfrecords('story_all')
|
||||||
|
# read_siamese_tfrecords('story_words_test')
|
||||||
|
# padd_zeros_siamese_tfrecords('story_words')
|
||||||
|
# fix_csv('story_words')
|
||||||
|
# pickle_constants('story_words')
|
||||||
|
# create_spectrogram_tfrecords('audio',sample_count=100)
|
||||||
|
# create_spectrogram_tfrecords('story_all',sample_count=25)
|
||||||
|
create_spectrogram_tfrecords('story_words',sample_count=10,train_test_ratio=0.2)
|
||||||
|
# create_spectrogram_tfrecords('audio',sample_count=50)
|
||||||
|
# read_siamese_tfrecords_generator('audio')
|
||||||
|
# padd_zeros_siamese_tfrecords('audio')
|
||||||
# create_padded_spectrogram()
|
# create_padded_spectrogram()
|
||||||
# create_speech_pairs_data()
|
# create_speech_pairs_data()
|
||||||
# print(speech_model_data())
|
# print(speech_model_data())
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,8 @@
|
||||||
from __future__ import absolute_import
|
from __future__ import absolute_import
|
||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from speech_data import speech_model_data
|
# from speech_data import speech_model_data
|
||||||
|
from speech_data import read_siamese_tfrecords_generator
|
||||||
from keras.models import Model,load_model
|
from keras.models import Model,load_model
|
||||||
from keras.layers import Input, Dense, Dropout, LSTM, Lambda, Concatenate
|
from keras.layers import Input, Dense, Dropout, LSTM, Lambda, Concatenate
|
||||||
from keras.losses import categorical_crossentropy
|
from keras.losses import categorical_crossentropy
|
||||||
|
|
@ -11,41 +12,44 @@ from keras.utils import to_categorical
|
||||||
from keras.optimizers import RMSprop
|
from keras.optimizers import RMSprop
|
||||||
from keras.callbacks import TensorBoard, ModelCheckpoint
|
from keras.callbacks import TensorBoard, ModelCheckpoint
|
||||||
from keras import backend as K
|
from keras import backend as K
|
||||||
|
from speech_utils import create_dir
|
||||||
|
|
||||||
|
# def euclidean_distance(vects):
|
||||||
def euclidean_distance(vects):
|
# x, y = vects
|
||||||
x, y = vects
|
# return K.sqrt(
|
||||||
return K.sqrt(
|
# K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))
|
||||||
K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))
|
#
|
||||||
|
#
|
||||||
|
# def eucl_dist_output_shape(shapes):
|
||||||
def eucl_dist_output_shape(shapes):
|
# shape1, shape2 = shapes
|
||||||
shape1, shape2 = shapes
|
# return (shape1[0], 1)
|
||||||
return (shape1[0], 1)
|
#
|
||||||
|
#
|
||||||
|
# def contrastive_loss(y_true, y_pred):
|
||||||
def contrastive_loss(y_true, y_pred):
|
# '''Contrastive loss from Hadsell-et-al.'06
|
||||||
'''Contrastive loss from Hadsell-et-al.'06
|
# http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
|
||||||
http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
|
# '''
|
||||||
'''
|
# return K.mean(y_true * K.square(y_pred) +
|
||||||
return K.mean(y_true * K.square(y_pred) +
|
# (1 - y_true) * K.square(K.maximum(1 - y_pred, 0)))
|
||||||
(1 - y_true) * K.square(K.maximum(1 - y_pred, 0)))
|
|
||||||
|
|
||||||
def create_base_rnn_network(input_dim):
|
def create_base_rnn_network(input_dim):
|
||||||
'''Base network to be shared (eq. to feature extraction).
|
'''Base network to be shared (eq. to feature extraction).
|
||||||
'''
|
'''
|
||||||
inp = Input(shape=input_dim)
|
inp = Input(shape=input_dim)
|
||||||
ls1 = LSTM(256, return_sequences=True)(inp)
|
ls0 = LSTM(512, return_sequences=True)(inp)
|
||||||
|
ls1 = LSTM(256, return_sequences=True)(ls0)
|
||||||
ls2 = LSTM(128, return_sequences=True)(ls1)
|
ls2 = LSTM(128, return_sequences=True)(ls1)
|
||||||
# ls3 = LSTM(32, return_sequences=True)(ls2)
|
# ls3 = LSTM(32, return_sequences=True)(ls2)
|
||||||
ls4 = LSTM(64)(ls2)
|
ls4 = LSTM(64)(ls2)
|
||||||
|
d1 = Dense(128, activation='relu')(ls4)
|
||||||
|
d2 = Dense(64, activation='relu')(d1)
|
||||||
return Model(inp, ls4)
|
return Model(inp, ls4)
|
||||||
|
|
||||||
|
|
||||||
def compute_accuracy(y_true, y_pred):
|
def compute_accuracy(y_true, y_pred):
|
||||||
'''Compute classification accuracy with a fixed threshold on distances.
|
'''Compute classification accuracy with a fixed threshold on distances.
|
||||||
'''
|
'''
|
||||||
pred = y_pred.ravel() < 0.5
|
pred = y_pred.ravel() > 0.5
|
||||||
return np.mean(pred == y_true)
|
return np.mean(pred == y_true)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -56,11 +60,12 @@ def accuracy(y_true, y_pred):
|
||||||
|
|
||||||
def dense_classifier(processed):
|
def dense_classifier(processed):
|
||||||
conc_proc = Concatenate()(processed)
|
conc_proc = Concatenate()(processed)
|
||||||
d1 = Dense(16, activation='relu')(conc_proc)
|
d1 = Dense(64, activation='relu')(conc_proc)
|
||||||
# dr1 = Dropout(0.1)(d1)
|
# dr1 = Dropout(0.1)(d1)
|
||||||
d2 = Dense(8, activation='relu')(d1)
|
d2 = Dense(128, activation='relu')(d1)
|
||||||
|
d3 = Dense(8, activation='relu')(d2)
|
||||||
# dr2 = Dropout(0.1)(d2)
|
# dr2 = Dropout(0.1)(d2)
|
||||||
return Dense(2, activation='softmax')(d2)
|
return Dense(2, activation='softmax')(d3)
|
||||||
|
|
||||||
def siamese_model(input_dim):
|
def siamese_model(input_dim):
|
||||||
# input_dim = (15, 1654)
|
# input_dim = (15, 1654)
|
||||||
|
|
@ -78,17 +83,24 @@ def siamese_model(input_dim):
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
def train_siamese():
|
def train_siamese(audio_group = 'audio'):
|
||||||
# the data, shuffled and split between train and test sets
|
# the data, shuffled and split between train and test sets
|
||||||
tr_pairs, te_pairs, tr_y_e, te_y_e = speech_model_data()
|
# tr_pairs, te_pairs, tr_y_e, te_y_e = speech_model_data()
|
||||||
tr_y = to_categorical(tr_y_e, num_classes=2)
|
batch_size = 128
|
||||||
te_y = to_categorical(te_y_e, num_classes=2)
|
model_dir = './models/'+audio_group
|
||||||
input_dim = (tr_pairs.shape[2], tr_pairs.shape[3])
|
create_dir(model_dir)
|
||||||
|
log_dir = './logs/'+audio_group
|
||||||
|
create_dir(log_dir)
|
||||||
|
tr_gen_fn,te_pairs,te_y,n_step,n_features,n_records = read_siamese_tfrecords_generator(audio_group,batch_size=batch_size)
|
||||||
|
tr_gen = tr_gen_fn()
|
||||||
|
# tr_y = to_categorical(tr_y_e, num_classes=2)
|
||||||
|
# te_y = to_categorical(te_y_e, num_classes=2)
|
||||||
|
input_dim = (n_step, n_features)
|
||||||
|
|
||||||
model = siamese_model(input_dim)
|
model = siamese_model(input_dim)
|
||||||
|
|
||||||
tb_cb = TensorBoard(
|
tb_cb = TensorBoard(
|
||||||
log_dir='./logs/siamese_logs',
|
log_dir=log_dir,
|
||||||
histogram_freq=1,
|
histogram_freq=1,
|
||||||
batch_size=32,
|
batch_size=32,
|
||||||
write_graph=True,
|
write_graph=True,
|
||||||
|
|
@ -97,39 +109,45 @@ def train_siamese():
|
||||||
embeddings_freq=0,
|
embeddings_freq=0,
|
||||||
embeddings_layer_names=None,
|
embeddings_layer_names=None,
|
||||||
embeddings_metadata=None)
|
embeddings_metadata=None)
|
||||||
cp_file_fmt = './models/siamese_speech_model-{epoch:02d}-epoch-{val_loss:0.2f}\
|
cp_file_fmt = model_dir+'/siamese_speech_model-{epoch:02d}-epoch-{val_loss:0.2f}\
|
||||||
-acc.h5'
|
-acc.h5'
|
||||||
|
|
||||||
cp_cb = ModelCheckpoint(
|
cp_cb = ModelCheckpoint(
|
||||||
cp_file_fmt,
|
cp_file_fmt,
|
||||||
monitor='val_loss',
|
monitor='val_loss',
|
||||||
verbose=0,
|
verbose=0,
|
||||||
save_best_only=False,
|
save_best_only=True,
|
||||||
save_weights_only=False,
|
save_weights_only=True,
|
||||||
mode='auto',
|
mode='auto',
|
||||||
period=1)
|
period=1)
|
||||||
# train
|
# train
|
||||||
rms = RMSprop(lr=0.001)
|
rms = RMSprop()#lr=0.001
|
||||||
model.compile(loss=categorical_crossentropy, optimizer=rms, metrics=[accuracy])
|
model.compile(loss=categorical_crossentropy, optimizer=rms, metrics=[accuracy])
|
||||||
model.fit(
|
# model.fit(
|
||||||
[tr_pairs[:, 0], tr_pairs[:, 1]],
|
# [tr_pairs[:, 0], tr_pairs[:, 1]],
|
||||||
tr_y,
|
# tr_y,
|
||||||
batch_size=128,
|
# batch_size=128,
|
||||||
epochs=50,
|
# epochs=100,
|
||||||
validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),
|
# validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),
|
||||||
callbacks=[tb_cb, cp_cb])
|
# callbacks=[tb_cb, cp_cb])
|
||||||
|
model.fit_generator(tr_gen
|
||||||
model.save('./models/siamese_speech_model-final.h5')
|
,epochs=1000
|
||||||
|
,steps_per_epoch=n_records//batch_size
|
||||||
|
,validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y)
|
||||||
|
,use_multiprocessing=True, workers=1
|
||||||
|
,callbacks=[tb_cb, cp_cb])
|
||||||
|
model.save(model_dir+'/siamese_speech_model-final.h5')
|
||||||
# compute final accuracy on training and test sets
|
# compute final accuracy on training and test sets
|
||||||
y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
|
# y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
|
||||||
tr_acc = compute_accuracy(tr_y, y_pred)
|
# tr_acc = compute_accuracy(tr_y, y_pred)
|
||||||
|
# print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
|
||||||
|
|
||||||
y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
|
y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
|
||||||
te_acc = compute_accuracy(te_y, y_pred)
|
te_acc = compute_accuracy(te_y, y_pred)
|
||||||
|
|
||||||
print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
|
|
||||||
print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
|
print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
train_siamese()
|
train_siamese('story_words')
|
||||||
|
# train_siamese('audio')
|
||||||
|
|
|
||||||
|
|
@ -1,15 +1,36 @@
|
||||||
import pyaudio
|
import pyaudio
|
||||||
|
from pysndfile import sndio as snd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
# from matplotlib import pyplot as plt
|
# from matplotlib import pyplot as plt
|
||||||
from spectro_gen import plot_stft, generate_spec_frec
|
from speech_spectrum import plot_stft, generate_spec_frec
|
||||||
|
|
||||||
|
SAMPLE_RATE = 22050
|
||||||
|
N_CHANNELS = 2
|
||||||
|
|
||||||
|
def file_player():
|
||||||
|
p_oup = pyaudio.PyAudio()
|
||||||
|
def play_file(audiopath,plot=False):
|
||||||
|
print('playing',audiopath)
|
||||||
|
samples, samplerate, form = snd.read(audiopath)
|
||||||
|
stream = p_oup.open(
|
||||||
|
format=pyaudio.paFloat32,
|
||||||
|
channels=2,
|
||||||
|
rate=samplerate,
|
||||||
|
output=True)
|
||||||
|
one_channel = np.asarray([samples, samples]).T.reshape(-1)
|
||||||
|
audio_data = one_channel.astype(np.float32).tobytes()
|
||||||
|
stream.write(audio_data)
|
||||||
|
stream.close()
|
||||||
|
if plot:
|
||||||
|
plot_stft(samples, SAMPLE_RATE)
|
||||||
|
def close_player():
|
||||||
|
p_oup.terminate()
|
||||||
|
return play_file,close_player
|
||||||
|
|
||||||
def record_spectrogram(n_sec, plot=False, playback=False):
|
def record_spectrogram(n_sec, plot=False, playback=False):
|
||||||
SAMPLE_RATE = 22050
|
# show_record_prompt()
|
||||||
N_CHANNELS = 2
|
|
||||||
N_SEC = n_sec
|
N_SEC = n_sec
|
||||||
CHUNKSIZE = int(SAMPLE_RATE * N_SEC / N_CHANNELS) # fixed chunk size
|
CHUNKSIZE = int(SAMPLE_RATE * N_SEC / N_CHANNELS) # fixed chunk size
|
||||||
# show_record_prompt()
|
|
||||||
input('Press [Enter] to start recording sample... ')
|
input('Press [Enter] to start recording sample... ')
|
||||||
p_inp = pyaudio.PyAudio()
|
p_inp = pyaudio.PyAudio()
|
||||||
stream = p_inp.open(
|
stream = p_inp.open(
|
||||||
|
|
@ -0,0 +1,74 @@
|
||||||
|
import os
|
||||||
|
import threading
|
||||||
|
|
||||||
|
import multiprocessing
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_df(args):
|
||||||
|
df, func, num, kwargs = args
|
||||||
|
return num, df.apply(func, **kwargs)
|
||||||
|
|
||||||
|
def apply_by_multiprocessing(df,func,**kwargs):
|
||||||
|
cores = multiprocessing.cpu_count()
|
||||||
|
workers=kwargs.pop('workers') if 'workers' in kwargs else cores
|
||||||
|
pool = multiprocessing.Pool(processes=workers)
|
||||||
|
result = pool.map(_apply_df, [(d, func, i, kwargs) for i,d in enumerate(np.array_split(df, workers))])
|
||||||
|
pool.close()
|
||||||
|
result=sorted(result,key=lambda x:x[0])
|
||||||
|
return pd.concat([i[1] for i in result])
|
||||||
|
|
||||||
|
def square(x):
|
||||||
|
return x**x
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
df = pd.DataFrame({'a':range(10), 'b':range(10)})
|
||||||
|
apply_by_multiprocessing(df, square, axis=1, workers=4)
|
||||||
|
|
||||||
|
|
||||||
|
def rm_rf(d):
|
||||||
|
for path in (os.path.join(d,f) for f in os.listdir(d)):
|
||||||
|
if os.path.isdir(path):
|
||||||
|
rm_rf(path)
|
||||||
|
else:
|
||||||
|
os.unlink(path)
|
||||||
|
os.rmdir(d)
|
||||||
|
|
||||||
|
def create_dir(direc):
|
||||||
|
if not os.path.exists(direc):
|
||||||
|
os.makedirs(direc)
|
||||||
|
else:
|
||||||
|
rm_rf(direc)
|
||||||
|
create_dir(direc)
|
||||||
|
|
||||||
|
|
||||||
|
#################### Now make the data generator threadsafe ####################
|
||||||
|
|
||||||
|
class threadsafe_iter:
|
||||||
|
"""Takes an iterator/generator and makes it thread-safe by
|
||||||
|
serializing call to the `next` method of given iterator/generator.
|
||||||
|
"""
|
||||||
|
def __init__(self, it):
|
||||||
|
self.it = it
|
||||||
|
self.lock = threading.Lock()
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
return self
|
||||||
|
|
||||||
|
def __next__(self): # Py3
|
||||||
|
with self.lock:
|
||||||
|
return next(self.it)
|
||||||
|
|
||||||
|
def next(self): # Py2
|
||||||
|
with self.lock:
|
||||||
|
return self.it.next()
|
||||||
|
|
||||||
|
|
||||||
|
def threadsafe_generator(f):
|
||||||
|
"""A decorator that takes a generator function and makes it thread-safe.
|
||||||
|
"""
|
||||||
|
def g(*a, **kw):
|
||||||
|
return threadsafe_iter(f(*a, **kw))
|
||||||
|
return g
|
||||||
|
|
@ -1,13 +1,15 @@
|
||||||
from speech_siamese import siamese_model
|
# from speech_siamese import siamese_model
|
||||||
from record_mic_speech import record_spectrogram
|
from speech_tools import record_spectrogram, file_player
|
||||||
from importlib import reload
|
# from importlib import reload
|
||||||
# import speech_data
|
# import speech_data
|
||||||
# reload(speech_data)
|
# reload(speech_data)
|
||||||
from speech_data import create_test_pair,get_word_pairs_data,speech_data
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
model = siamese_model((15, 1654))
|
import os
|
||||||
model.load_weights('./models/siamese_speech_model-final.h5')
|
import pickle
|
||||||
|
import tensorflow as tf
|
||||||
|
import csv
|
||||||
|
from speech_data import padd_zeros
|
||||||
|
|
||||||
def predict_recording_with(m,sample_size=15):
|
def predict_recording_with(m,sample_size=15):
|
||||||
spec1 = record_spectrogram(n_sec=1.4)
|
spec1 = record_spectrogram(n_sec=1.4)
|
||||||
|
|
@ -24,7 +26,85 @@ def test_with(audio_group):
|
||||||
print(np.argmax(model.predict([X[:, 0], X[:, 1]]),axis=1))
|
print(np.argmax(model.predict([X[:, 0], X[:, 1]]),axis=1))
|
||||||
print(Y.astype(np.int8))
|
print(Y.astype(np.int8))
|
||||||
|
|
||||||
test_with('rand_edu')
|
def evaluate_siamese(audio_group='audio',model_file = 'siamese_speech_model-305-epoch-0.20-acc.h5'):
|
||||||
|
# audio_group='audio';model_file = 'siamese_speech_model-305-epoch-0.20-acc.h5'
|
||||||
|
records_file = os.path.join('./outputs',audio_group+'.train.tfrecords')
|
||||||
|
const_file = os.path.join('./outputs',audio_group+'.constants')
|
||||||
|
model_weights_path =os.path.join('./models/story_words/',model_file)
|
||||||
|
(n_spec,n_features,n_records) = pickle.load(open(const_file,'rb'))
|
||||||
|
print('evaluating tfrecords({}-train)...'.format(audio_group))
|
||||||
|
|
||||||
|
model = siamese_model((n_spec, n_features))
|
||||||
|
model.load_weights(model_weights_path)
|
||||||
|
record_iterator = tf.python_io.tf_record_iterator(path=records_file)
|
||||||
|
#tqdm(enumerate(record_iterator),total=n_records)
|
||||||
|
result_csv = open('./outputs/' + audio_group + '.results.csv','w')
|
||||||
|
result_csv_w = csv.writer(result_csv, quoting=csv.QUOTE_MINIMAL)
|
||||||
|
result_csv_w.writerow(["phoneme1","phoneme2","voice1","voice2","rate1","rate2","variant1","variant2","file1","file2"])
|
||||||
|
for (i,string_record) in enumerate(record_iterator):
|
||||||
|
# string_record = next(record_iterator)
|
||||||
|
example = tf.train.Example()
|
||||||
|
example.ParseFromString(string_record)
|
||||||
|
spec_n1 = example.features.feature['spec_n1'].int64_list.value[0]
|
||||||
|
spec_n2 = example.features.feature['spec_n2'].int64_list.value[0]
|
||||||
|
spec_w1 = example.features.feature['spec_w1'].int64_list.value[0]
|
||||||
|
spec_w2 = example.features.feature['spec_w2'].int64_list.value[0]
|
||||||
|
spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1)
|
||||||
|
spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2)
|
||||||
|
p_spec1,p_spec2 = padd_zeros(spec1,n_spec),padd_zeros(spec2,n_spec)
|
||||||
|
input_arr = np.asarray([[p_spec1,p_spec2]])
|
||||||
|
output_arr = np.asarray([example.features.feature['output'].int64_list.value])
|
||||||
|
y_pred = model.predict([input_arr[:, 0], input_arr[:, 1]])
|
||||||
|
predicted = np.asarray(y_pred[0]>0.5).astype(output_arr.dtype)
|
||||||
|
expected = output_arr[0]
|
||||||
|
if np.all(predicted == expected):
|
||||||
|
continue
|
||||||
|
word = example.features.feature['word'].bytes_list.value[0].decode()
|
||||||
|
phoneme1 = example.features.feature['phoneme1'].bytes_list.value[0].decode()
|
||||||
|
phoneme2 = example.features.feature['phoneme2'].bytes_list.value[0].decode()
|
||||||
|
voice1 = example.features.feature['voice1'].bytes_list.value[0].decode()
|
||||||
|
voice2 = example.features.feature['voice2'].bytes_list.value[0].decode()
|
||||||
|
language = example.features.feature['language'].bytes_list.value[0].decode()
|
||||||
|
rate1 = example.features.feature['rate1'].int64_list.value[0]
|
||||||
|
rate2 = example.features.feature['rate2'].int64_list.value[0]
|
||||||
|
variant1 = example.features.feature['variant1'].bytes_list.value[0].decode()
|
||||||
|
variant2 = example.features.feature['variant2'].bytes_list.value[0].decode()
|
||||||
|
file1 = example.features.feature['file1'].bytes_list.value[0].decode()
|
||||||
|
file2 = example.features.feature['file2'].bytes_list.value[0].decode()
|
||||||
|
print(phoneme1,phoneme2,voice1,voice2,rate1,rate2,variant1,variant2,file1,file2)
|
||||||
|
result_csv_w.writerow([phoneme1,phoneme2,voice1,voice2,rate1,rate2,variant1,variant2,file1,file2])
|
||||||
|
result_csv.close()
|
||||||
|
|
||||||
|
|
||||||
|
def play_results(audio_group='audio'):
|
||||||
|
result_data = pd.read_csv('./outputs/' + audio_group + '.results.csv')
|
||||||
|
play_file,close_player = file_player()
|
||||||
|
quit = False
|
||||||
|
for (i,r) in result_data.iterrows():
|
||||||
|
if quit:
|
||||||
|
break
|
||||||
|
keys = ["phoneme1","phoneme2","voice1","voice2","rate1","rate2","variant1","variant2"]
|
||||||
|
row_vals = [str(r[k]) for k in keys]
|
||||||
|
h_str = '\t'.join(keys)
|
||||||
|
row_str = '\t'.join(row_vals)
|
||||||
|
while True:
|
||||||
|
print(h_str)
|
||||||
|
print(row_str)
|
||||||
|
play_file('./outputs/'+audio_group+'/'+r['file1'],True)
|
||||||
|
play_file('./outputs/'+audio_group+'/'+r['file2'],True)
|
||||||
|
a = input("press 'r/q/[Enter]' to replay/quit/continue:\t")
|
||||||
|
if a == 'r':
|
||||||
|
continue
|
||||||
|
if a == 'q':
|
||||||
|
quit = True
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
close_player()
|
||||||
|
|
||||||
|
# evaluate_siamese('story_words',model_file='siamese_speech_model-305-epoch-0.20-acc.h5')
|
||||||
|
play_results('story_words')
|
||||||
|
# test_with('rand_edu')
|
||||||
# sunflower_data,sunflower_result = get_word_pairs_data('sweater',15)
|
# sunflower_data,sunflower_result = get_word_pairs_data('sweater',15)
|
||||||
# print(np.argmax(model.predict([sunflower_data[:, 0], sunflower_data[:, 1]]),axis=1))
|
# print(np.argmax(model.predict([sunflower_data[:, 0], sunflower_data[:, 1]]),axis=1))
|
||||||
# print(sunflower_result)
|
# print(sunflower_result)
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ from AppKit import NSSpeechSynthesizer, NSSpeechInputModeProperty
|
||||||
from AppKit import NSSpeechModePhoneme
|
from AppKit import NSSpeechModePhoneme
|
||||||
from Foundation import NSURL
|
from Foundation import NSURL
|
||||||
import json
|
import json
|
||||||
|
import csv
|
||||||
import random
|
import random
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
|
@ -81,6 +82,11 @@ class SynthFile(object):
|
||||||
|
|
||||||
return ','.join([str(c) for c in cols])+'\n'
|
return ','.join([str(c) for c in cols])+'\n'
|
||||||
|
|
||||||
|
def get_values(self):
|
||||||
|
cols = [self.word, self.phoneme, self.voice,
|
||||||
|
self.voice_lang, self.rate, self.variant,
|
||||||
|
self.filename]
|
||||||
|
return [str(c) for c in cols]
|
||||||
|
|
||||||
class SynthVariant(object):
|
class SynthVariant(object):
|
||||||
"""docstring for SynthVariant."""
|
"""docstring for SynthVariant."""
|
||||||
|
|
@ -191,22 +197,11 @@ def synth_generator():
|
||||||
print("It took {} to synthsize all variants.".format(time_str))
|
print("It took {} to synthsize all variants.".format(time_str))
|
||||||
return synth_for_words
|
return synth_for_words
|
||||||
|
|
||||||
|
|
||||||
def write_synths(synth_list, fname, csv=False):
|
|
||||||
f = open(fname, 'w')
|
|
||||||
if csv:
|
|
||||||
for s in synth_list:
|
|
||||||
f.write(s.get_csv())
|
|
||||||
else:
|
|
||||||
json.dump([s.get_json() for s in synth_list], f)
|
|
||||||
f.close()
|
|
||||||
|
|
||||||
|
|
||||||
def synth_logger(fname, csv=False):
|
def synth_logger(fname, csv=False):
|
||||||
f = open(fname, 'w')
|
f = open(fname, 'w')
|
||||||
|
s_csv_w = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
|
||||||
def csv_writer(s):
|
def csv_writer(s):
|
||||||
f.write(s.get_csv())
|
s_csv_w.writerow(s.get_values())
|
||||||
synth_list = []
|
synth_list = []
|
||||||
|
|
||||||
def json_writer(s):
|
def json_writer(s):
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue