Malar Kannan 2017-11-14 21:59:15 +05:30
commit 036667d1c7
11 changed files with 759 additions and 425 deletions

pandas_parallel.py
View File

@@ -1,25 +0,0 @@
import multiprocessing
import pandas as pd
import numpy as np
def _apply_df(args):
df, func, num, kwargs = args
return num, df.apply(func, **kwargs)
def apply_by_multiprocessing(df,func,**kwargs):
cores = multiprocessing.cpu_count()
workers=kwargs.pop('workers') if 'workers' in kwargs else cores
pool = multiprocessing.Pool(processes=workers)
result = pool.map(_apply_df, [(d, func, i, kwargs) for i,d in enumerate(np.array_split(df, workers))])
pool.close()
result=sorted(result,key=lambda x:x[0])
return pd.concat([i[1] for i in result])
def square(x):
return x**x
if __name__ == '__main__':
df = pd.DataFrame({'a':range(10), 'b':range(10)})
apply_by_multiprocessing(df, square, axis=1, workers=4)
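Since each chunk keeps its original index and the results are re-sorted by chunk number before pd.concat, the parallel apply should reproduce a plain df.apply. A minimal sanity check of the helper above (a sketch, assuming a fork-based platform so the function pickles cleanly; square and apply_by_multiprocessing as defined in this file):
import pandas as pd

if __name__ == '__main__':
    df = pd.DataFrame({'a': range(10), 'b': range(10)})
    serial = df.apply(square, axis=1)                                   # single-process baseline
    parallel = apply_by_multiprocessing(df, square, axis=1, workers=4)  # chunked across 4 workers
    assert serial.equals(parallel)                                      # order restored by the sort on chunk number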

record_mic_speech.py
View File

@@ -1,42 +0,0 @@
import pyaudio
import numpy as np
# from matplotlib import pyplot as plt
from spectro_gen import plot_stft, generate_spec_frec
def record_spectrogram(n_sec, plot=False, playback=False):
SAMPLE_RATE = 22050
N_CHANNELS = 2
N_SEC = n_sec
CHUNKSIZE = int(SAMPLE_RATE * N_SEC / N_CHANNELS) # fixed chunk size
# show_record_prompt()
input('Press [Enter] to start recording sample... ')
p_inp = pyaudio.PyAudio()
stream = p_inp.open(
format=pyaudio.paFloat32,
channels=N_CHANNELS,
rate=SAMPLE_RATE,
input=True,
frames_per_buffer=CHUNKSIZE)
data = stream.read(CHUNKSIZE)
numpydata = np.frombuffer(data, dtype=np.float32)
multi_channel = np.abs(np.reshape(numpydata, (-1, 2))).mean(axis=1)
one_channel = np.asarray([multi_channel, -1 * multi_channel]).T.reshape(-1)
mean_channel_data = one_channel.tobytes()
stream.stop_stream()
stream.close()
p_inp.terminate()
if plot:
plot_stft(one_channel, SAMPLE_RATE)
if playback:
p_oup = pyaudio.PyAudio()
stream = p_oup.open(
format=pyaudio.paFloat32,
channels=2,
rate=SAMPLE_RATE,
output=True)
stream.write(mean_channel_data)
stream.close()
p_oup.terminate()
ims, _ = generate_spec_frec(one_channel, SAMPLE_RATE)
return ims

76
requirements-linux.txt Normal file
View File

@@ -0,0 +1,76 @@
bleach==1.5.0
click==6.7
cloudpickle==0.4.1
cycler==0.10.0
dask==0.15.4
decorator==4.1.2
distributed==1.19.3
entrypoints==0.2.3
enum34==1.1.6
futures==3.1.1
h5py==2.7.1
HeapDict==1.0.0
html5lib==0.9999999
ipykernel==4.6.1
ipython==6.2.1
ipython-genutils==0.2.0
ipywidgets==7.0.3
jedi==0.11.0
Jinja2==2.9.6
jsonschema==2.6.0
jupyter==1.0.0
jupyter-client==5.1.0
jupyter-console==5.2.0
jupyter-core==4.3.0
Keras==2.0.8
locket==0.2.0
Markdown==2.6.9
MarkupSafe==1.0
matplotlib==2.1.0
mistune==0.7.4
msgpack-python==0.4.8
nbconvert==5.3.1
nbformat==4.4.0
notebook==5.2.0
numexpr==2.6.4
numpy==1.13.3
pandas==0.20.3
pandocfilters==1.4.2
parso==0.1.0
partd==0.3.8
pexpect==4.2.1
pickleshare==0.7.4
progressbar2==3.34.3
prompt-toolkit==1.0.15
protobuf==3.4.0
psutil==5.4.0
ptyprocess==0.5.2
PyAudio==0.2.11
Pygments==2.2.0
pyparsing==2.2.0
pysndfile==1.0.0
python-dateutil==2.6.1
python-utils==2.2.0
pytz==2017.2
PyYAML==3.12
pyzmq==16.0.2
qtconsole==4.3.1
scikit-learn==0.19.0
scipy==0.19.1
simplegeneric==0.8.1
six==1.11.0
sortedcontainers==1.5.7
tables==3.4.2
tblib==1.3.2
tensorflow==1.3.0
tensorflow-tensorboard==0.4.0rc1
terminado==0.6
testpath==0.3.1
toolz==0.8.2
tornado==4.5.2
tqdm==4.19.4
traitlets==4.3.2
wcwidth==0.1.7
Werkzeug==0.12.2
widgetsnbextension==3.0.6
zict==0.1.3

speech_data.py
View File

@@ -1,220 +1,252 @@
 import pandas as pd
-from pandas_parallel import apply_by_multiprocessing
+from speech_tools import apply_by_multiprocessing, threadsafe_iter
 # import dask as dd
 # import dask.dataframe as ddf
 import tensorflow as tf
+from tensorflow.python.ops import data_flow_ops
 import numpy as np
-from spectro_gen import generate_aiff_spectrogram
+from speech_spectrum import generate_aiff_spectrogram
 from sklearn.model_selection import train_test_split
 import itertools
 import os
 import random
 import csv
 import gc
+import pickle
+from tqdm import tqdm
-def get_siamese_pairs(groupF1, groupF2):
-    group1 = [r for (i, r) in groupF1.iterrows()]
-    group2 = [r for (i, r) in groupF2.iterrows()]
-    diff = [(g1, g2) for g2 in group2 for g1 in group1]
-    same = [i for i in itertools.combinations(group1, 2)
-            ] + [i for i in itertools.combinations(group2, 2)]
-    random.shuffle(same)
-    random.shuffle(diff)
-    # return (random.sample(same,10), random.sample(diff,10))
-    return same[:10], diff[:10]
 def siamese_pairs(rightGroup, wrongGroup):
     group1 = [r for (i, r) in rightGroup.iterrows()]
     group2 = [r for (i, r) in wrongGroup.iterrows()]
     rightWrongPairs = [(g1, g2) for g2 in group2 for g1 in group1]
-    rightRightPairs = [i for i in itertools.combinations(group1, 2)]
-    random.shuffle(rightWrongPairs)
-    random.shuffle(rightRightPairs)
-    # return (random.sample(same,10), random.sample(diff,10))
-    return rightRightPairs[:10], rightWrongPairs[:10]
+    rightRightPairs = [i for i in itertools.combinations(group1, 2)]  # +[i for i in itertools.combinations(group2, 2)]
+    # random.shuffle(rightWrongPairs)
+    # random.shuffle(rightRightPairs)
+    # return rightRightPairs[:10], rightWrongPairs[:10]
+    return rightRightPairs[:32], rightWrongPairs[:32]
-def append_zeros(spgr, max_samples):
-    return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
-                      'median')
+def _float_feature(value):
+    return tf.train.Feature(float_list=tf.train.FloatList(value=value))
+def _int64_feature(value):
+    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
+def _bytes_feature(value):
+    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))
+def create_spectrogram_tfrecords(audio_group='audio', sample_count=0, train_test_ratio=0.1):
+    '''
+    http://warmspringwinds.github.io/tensorflow/tf-slim/2016/12/21/tfrecords-guide/
+    http://www.machinelearninguru.com/deep_learning/tensorflow/basics/tfrecord/tfrecord.html
+    '''
+    audio_samples = pd.read_csv('./outputs/' + audio_group + '.csv', index_col=0)
+    audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x)
+    n_records, n_spec, n_features = 0, 0, 0
+    def write_samples(wg, sample_name):
+        word_group_prog = tqdm(wg, desc='Computing spectrogram')
+        record_file = './outputs/{}.{}.tfrecords'.format(audio_group, sample_name)
+        writer = tf.python_io.TFRecordWriter(record_file)
+        for (w, word_group) in word_group_prog:
+            word_group_prog.set_postfix(word=w, sample_name=sample_name)
+            g = word_group.reset_index()
+            g['spectrogram'] = apply_by_multiprocessing(g['file_path'], generate_aiff_spectrogram)
+            sample_right = g.loc[g['variant'] == 'low']
+            sample_wrong = g.loc[g['variant'] == 'medium']
+            same, diff = siamese_pairs(sample_right, sample_wrong)
+            groups = [([0, 1], same), ([1, 0], diff)]
+            for (output, group) in groups:
+                group_prog = tqdm(group, desc='Writing Spectrogram')
+                for sample1, sample2 in group_prog:
+                    same = sample1['variant'] == sample2['variant']
+                    phon_same = sample1['phonemes'] == sample2['phonemes']
+                    voice_diff = sample1['voice'] != sample2['voice']
+                    if not same and phon_same:
+                        continue
+                    if same and not voice_diff:
+                        continue
+                    group_prog.set_postfix(output=output
+                                           , var1=sample1['variant']
+                                           , var2=sample2['variant'])
+                    spectro1, spectro2 = sample1['spectrogram'], sample2['spectrogram']
+                    spec_n1, spec_n2 = spectro1.shape[0], spectro2.shape[0]
+                    spec_w1, spec_w2 = spectro1.shape[1], spectro2.shape[1]
+                    spec1, spec2 = spectro1.reshape(-1), spectro2.reshape(-1)
+                    nonlocal n_spec, n_records, n_features
+                    n_spec = max([n_spec, spec_n1, spec_n2])
+                    n_features = spec_w1
+                    n_records += 1
+                    example = tf.train.Example(features=tf.train.Features(
+                        feature={
+                            'word': _bytes_feature([w.encode('utf-8')]),
+                            'phoneme1': _bytes_feature([sample1['phonemes'].encode('utf-8')]),
+                            'phoneme2': _bytes_feature([sample2['phonemes'].encode('utf-8')]),
+                            'voice1': _bytes_feature([sample1['voice'].encode('utf-8')]),
+                            'voice2': _bytes_feature([sample2['voice'].encode('utf-8')]),
+                            'language': _bytes_feature([sample1['language'].encode('utf-8')]),
+                            'rate1': _int64_feature([sample1['rate']]),
+                            'rate2': _int64_feature([sample2['rate']]),
+                            'variant1': _bytes_feature([sample1['variant'].encode('utf-8')]),
+                            'variant2': _bytes_feature([sample2['variant'].encode('utf-8')]),
+                            'file1': _bytes_feature([sample1['file'].encode('utf-8')]),
+                            'file2': _bytes_feature([sample2['file'].encode('utf-8')]),
+                            'spec1': _float_feature(spec1),
+                            'spec2': _float_feature(spec2),
+                            'spec_n1': _int64_feature([spec_n1]),
+                            'spec_w1': _int64_feature([spec_w1]),
+                            'spec_n2': _int64_feature([spec_n2]),
+                            'spec_w2': _int64_feature([spec_w2]),
+                            'output': _int64_feature(output)
+                        }
+                    ))
+                    writer.write(example.SerializeToString())
+                group_prog.close()
+        word_group_prog.close()
+        writer.close()
+    word_groups = [i for i in audio_samples.groupby('word')]
+    wg_sampled = reservoir_sample(word_groups, sample_count) if sample_count > 0 else word_groups
+    tr_audio_samples, te_audio_samples = train_test_split(wg_sampled, test_size=train_test_ratio)
+    write_samples(tr_audio_samples, 'train')
+    write_samples(te_audio_samples, 'test')
+    const_file = os.path.join('./outputs', audio_group + '.constants')
+    pickle.dump((n_spec, n_features, n_records), open(const_file, 'wb'))
 def padd_zeros(spgr, max_samples):
     return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
                       'constant')
-def to_onehot(a, class_count=2):
-    # >>> a = np.array([1, 0, 3])
-    a_row_n = a.shape[0]
-    b = np.zeros((a_row_n, class_count))
-    b[np.arange(a_row_n), a] = 1
-    return b
-def create_pair(l, r, max_samples):
-    l_sample = padd_zeros(l, max_samples)
-    r_sample = padd_zeros(r, max_samples)
-    return np.asarray([l_sample, r_sample])
-def create_test_pair(l, r, max_samples):
-    l_sample = append_zeros(l, max_samples)
-    r_sample = append_zeros(r, max_samples)
-    return np.asarray([[l_sample, r_sample]])
-def create_X(sp, max_samples):
-    return create_pair(sp[0]['spectrogram'], sp[1]['spectrogram'], max_samples)
+def reservoir_sample(iterable, k):
+    it = iter(iterable)
+    if not (k > 0):
+        raise ValueError("sample size must be positive")
+    sample = list(itertools.islice(it, k))  # fill the reservoir
+    random.shuffle(sample)  # if the number of items is less than *k*,
+                            # return all items in random order
+    for i, item in enumerate(it, start=k+1):
+        j = random.randrange(i)  # random [0..i)
+        if j < k:
+            sample[j] = item  # replace item with gradually decreasing probability
+    return sample
+def read_siamese_tfrecords_generator(audio_group='audio', batch_size=32, test_size=0):
+    records_file = os.path.join('./outputs', audio_group + '.train.tfrecords')
+    input_pairs = []
+    output_class = []
+    const_file = os.path.join('./outputs', audio_group + '.constants')
+    (n_spec, n_features, n_records) = pickle.load(open(const_file, 'rb'))
+    print('reading tfrecords({}-train)...'.format(audio_group))
+    # @threadsafe_iter
+    def record_generator():
+        input_data = []
+        output_data = []
+        while True:
+            record_iterator = tf.python_io.tf_record_iterator(path=records_file)
+            # tqdm(enumerate(record_iterator), total=n_records)
+            for (i, string_record) in enumerate(record_iterator):
+                example = tf.train.Example()
+                example.ParseFromString(string_record)
+                spec_n1 = example.features.feature['spec_n1'].int64_list.value[0]
+                spec_n2 = example.features.feature['spec_n2'].int64_list.value[0]
+                spec_w1 = example.features.feature['spec_w1'].int64_list.value[0]
+                spec_w2 = example.features.feature['spec_w2'].int64_list.value[0]
+                spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1, spec_w1)
+                spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2, spec_w2)
+                p_spec1, p_spec2 = padd_zeros(spec1, n_spec), padd_zeros(spec2, n_spec)
+                input_data.append(np.asarray([p_spec1, p_spec2]))
+                output = example.features.feature['output'].int64_list.value
+                output_data.append(np.asarray(output))
+                if len(input_data) == batch_size:
+                    input_arr = np.asarray(input_data)
+                    output_arr = np.asarray(output_data)
+                    yield ([input_arr[:, 0], input_arr[:, 1]], output_arr)
+                    input_data = []
+                    output_data = []
+    # Read test in one shot
+    te_records_file = os.path.join('./outputs', audio_group + '.test.tfrecords')
+    te_re_iterator = tf.python_io.tf_record_iterator(path=te_records_file)  # the commit read records_file (the train set) here, which looks unintended
+    te_n_records = len([i for i in te_re_iterator])
+    te_re_iterator = tf.python_io.tf_record_iterator(path=te_records_file)
+    print('reading tfrecords({}-test)...'.format(audio_group))
+    test_size = min([test_size, te_n_records]) if test_size > 0 else te_n_records
+    input_data = np.zeros((test_size, 2, n_spec, n_features))
+    output_data = np.zeros((test_size, 2))
+    random_samples = enumerate(reservoir_sample(te_re_iterator, test_size))
+    for (i, string_record) in tqdm(random_samples, total=test_size):
+        example = tf.train.Example()
+        example.ParseFromString(string_record)
+        spec_n1 = example.features.feature['spec_n1'].int64_list.value[0]
+        spec_n2 = example.features.feature['spec_n2'].int64_list.value[0]
+        spec_w1 = example.features.feature['spec_w1'].int64_list.value[0]
+        spec_w2 = example.features.feature['spec_w2'].int64_list.value[0]
+        spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1, spec_w1)
+        spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2, spec_w2)
+        p_spec1, p_spec2 = padd_zeros(spec1, n_spec), padd_zeros(spec2, n_spec)
+        input_data[i] = np.asarray([p_spec1, p_spec2])
+        output = example.features.feature['output'].int64_list.value
+        output_data[i] = np.asarray(output)
+    return record_generator, input_data, output_data, n_spec, n_features, n_records
-# def get_word_pairs_data(word, max_samples):
-#     audio_samples = pd.read_csv(
-#         './outputs/audio.csv',
-#         names=['word', 'voice', 'rate', 'variant', 'file'])
-#     audio_samples = audio_samples.loc[audio_samples['word'] ==
-#                                       word].reset_index(drop=True)
-#     audio_samples.loc[:, 'spectrogram'] = audio_samples.loc[:, 'file'].apply(
-#         lambda x: 'outputs/audio/' + x).apply(generate_aiff_spectrogram)
-#     max_samples = audio_samples['spectrogram'].apply(
-#         lambda x: x.shape[0]).max()
-#     same_data, diff_data = [], []
-#     for (w, g) in audio_samples.groupby(audio_samples['word']):
-#         sample_norm = g.loc[audio_samples['variant'] == 'normal']
-#         sample_phon = g.loc[audio_samples['variant'] == 'phoneme']
-#         same, diff = get_siamese_pairs(sample_norm, sample_phon)
-#         same_data.extend([create_X(s, max_samples) for s in same])
-#         diff_data.extend([create_X(d, max_samples) for d in diff])
-#     Y = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))])
-#     X = np.asarray(same_data + diff_data)
-#     # tr_pairs, te_pairs, tr_y, te_y = train_test_split(X, Y, test_size=0.1)
-#     return (X, Y)
+def audio_samples_word_count(audio_group='audio'):
+    audio_samples = pd.read_csv('./outputs/' + audio_group + '.csv')
+    return len(audio_samples.groupby(audio_samples['word']))
+def record_generator_count(records_file):
+    record_iterator = tf.python_io.tf_record_iterator(path=records_file)
+    count = len([i for i in record_iterator])
+    record_iterator = tf.python_io.tf_record_iterator(path=records_file)
+    return record_iterator, count
-def create_spectrogram_data(audio_group='audio'):
-    audio_samples = pd.read_csv('./outputs/' + audio_group + '.csv'
-                                , names=['word', 'phonemes', 'voice', 'language', 'rate', 'variant', 'file']
-                                , quoting=csv.QUOTE_NONE)
-    # audio_samples = audio_samples.loc[audio_samples['word'] ==
-    #                                   'sunflowers'].reset_index(drop=True)
-    audio_samples['file_paths'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x)
-    audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_paths'], os.path.exists)
-    audio_samples = audio_samples[audio_samples['file_exists'] == True].reset_index()
-    audio_samples['spectrogram'] = apply_by_multiprocessing(audio_samples['file_paths'], generate_aiff_spectrogram)  # .apply(
-    audio_samples['window_count'] = audio_samples.loc[:, 'spectrogram'].apply(lambda x: x.shape[0])
-    audio_samples.to_pickle('outputs/{}-spectrogram.pkl'.format(audio_group))
-def create_spectrogram_tfrecords(audio_group='audio'):
-    audio_samples = pd.read_csv('./outputs/' + audio_group + '.csv'
-                                , names=['word', 'phonemes', 'voice', 'language', 'rate', 'variant', 'file']
-                                , quoting=csv.QUOTE_NONE)
-    # audio_samples = audio_samples.loc[audio_samples['word'] ==
-    #                                   'sunflowers'].reset_index(drop=True)
-    audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x)
-    audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_path'], os.path.exists)
-    audio_samples = audio_samples[audio_samples['file_exists'] == True].reset_index()
-    def _float_feature(value):
-        return tf.train.Feature(float_list=tf.train.FloatList(value=value))
-    def _int64_feature(value):
-        return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
-    def _bytes_feature(value):
-        return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))
-    writer = tf.python_io.TFRecordWriter('./outputs/' + audio_group + '.tfrecords')
-    # audio_samples = audio_samples[:100]
-    for (w, word_group) in audio_samples.groupby(audio_samples['word']):
-        g = word_group.reset_index()
-        g['spectrogram'] = apply_by_multiprocessing(g['file_path'], generate_aiff_spectrogram)
-        sample_right = g.loc[audio_samples['variant'] == 'low']
-        sample_wrong = g.loc[audio_samples['variant'] == 'medium']
-        same, diff = siamese_pairs(sample_right, sample_wrong)
-        groups = [([0, 1], same), ([1, 0], diff)]
-        for (output, group) in groups:
-            for sample1, sample2 in group:
-                spectro1, spectro2 = sample1['spectrogram'], sample2['spectrogram']
-                spec_n1, spec_n2 = spectro1.shape[0], spectro2.shape[0]
-                spec_w1, spec_w2 = spectro1.shape[1], spectro2.shape[1]
-                spec1, spec2 = spectro1.reshape(-1), spectro2.reshape(-1)
-                example = tf.train.Example(features=tf.train.Features(
-                    feature={
-                        'word': _bytes_feature([w.encode('utf-8')]),
-                        'phoneme1': _bytes_feature([sample1['phonemes'].encode('utf-8')]),
-                        'phoneme2': _bytes_feature([sample2['phonemes'].encode('utf-8')]),
-                        'voice1': _bytes_feature([sample1['voice'].encode('utf-8')]),
-                        'voice2': _bytes_feature([sample2['voice'].encode('utf-8')]),
-                        'language': _bytes_feature([sample1['language'].encode('utf-8')]),
-                        'rate1': _int64_feature([sample1['rate']]),
-                        'rate2': _int64_feature([sample2['rate']]),
-                        'variant1': _bytes_feature([sample1['variant'].encode('utf-8')]),
-                        'variant2': _bytes_feature([sample2['variant'].encode('utf-8')]),
-                        'file1': _bytes_feature([sample1['file'].encode('utf-8')]),
-                        'file2': _bytes_feature([sample2['file'].encode('utf-8')]),
-                        'spec1': _float_feature(spec1),
-                        'spec2': _float_feature(spec2),
-                        'spec_n1': _int64_feature([spec_n1]),
-                        'spec_w1': _int64_feature([spec_w1]),
-                        'spec_n2': _int64_feature([spec_n2]),
-                        'spec_w2': _int64_feature([spec_w2]),
-                        'output': _int64_feature(output)
-                    }
-                ))
-                writer.write(example.SerializeToString())
-    writer.close()
+def fix_csv(audio_group='audio'):
+    audio_csv_lines = open('./outputs/' + audio_group + '.csv.orig', 'r').readlines()
+    audio_csv_data = [i.strip().split(',') for i in audio_csv_lines]
+    proper_rows = [i for i in audio_csv_data if len(i) == 7]
+    with open('./outputs/' + audio_group + '.csv', 'w') as fixed_csv:
+        fixed_csv_w = csv.writer(fixed_csv, quoting=csv.QUOTE_MINIMAL)
+        fixed_csv_w.writerows(proper_rows)
+    audio_samples = pd.read_csv('./outputs/' + audio_group + '.csv'
+                                , names=['word', 'phonemes', 'voice', 'language', 'rate', 'variant', 'file'])
+    audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x)
+    audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_path'], os.path.exists)
+    audio_samples = audio_samples[audio_samples['file_exists'] == True]
+    audio_samples = audio_samples.drop(['file_path', 'file_exists'], axis=1).reset_index(drop=True)
+    audio_samples.to_csv('./outputs/' + audio_group + '.csv')
-def create_tagged_data(audio_samples):
-    same_data, diff_data = [], []
-    for (w, g) in audio_samples.groupby(audio_samples['word']):
-        # sample_norm = g.loc[audio_samples['variant'] == 'low']
-        # sample_phon = g.loc[audio_samples['variant'] == 'medium']
-        sample_norm = g.loc[audio_samples['variant'] == 'normal']
-        sample_phon = g.loc[audio_samples['variant'] == 'phoneme']
-        same, diff = get_siamese_pairs(sample_norm, sample_phon)
-        same_data.extend([create_X(s) for s in same])
-        diff_data.extend([create_X(d) for d in diff])
-    print('creating all speech pairs')
-    Y_f = np.hstack([np.ones(len(same_data)), np.zeros(len(diff_data))])
-    Y = to_onehot(Y_f.astype(np.int8))
-    print('casting as array speech pairs')
-    X = np.asarray(same_data + diff_data)
-    return X, Y
-def create_speech_pairs_data(audio_group='audio'):
-    audio_samples = pd.read_pickle('outputs/{}-spectrogram.pkl'.format(audio_group))
-    # sample_size = audio_samples['spectrogram'][0].shape[1]
-    tr_audio_samples, te_audio_samples = train_test_split(audio_samples, test_size=0.1)
-    def save_samples_for(sample_name, samples):
-        print('generating {} siamese speech pairs'.format(sample_name))
-        X, Y = create_tagged_data(samples)
-        print('shuffling array speech pairs')
-        rng_state = np.random.get_state()
-        np.random.shuffle(X)
-        np.random.set_state(rng_state)
-        np.random.shuffle(Y)
-        print('pickling X/Y')
-        np.save('outputs/{}-train-X.npy'.format(audio_group), X)
-        np.save('outputs/{}-train-Y.npy'.format(audio_group), Y)
-    save_samples_for('train', tr_audio_samples)
-    save_samples_for('test', te_audio_samples)
-def speech_data(audio_group='audio'):
-    X = np.load('outputs/{}-X.npy'.format(audio_group)) / 255.0
-    Y = np.load('outputs/{}-Y.npy'.format(audio_group))
-    return (X, Y)
-def speech_model_data():
-    tr_pairs = np.load('outputs/tr_pairs.npy') / 255.0
-    te_pairs = np.load('outputs/te_pairs.npy') / 255.0
-    tr_pairs[tr_pairs < 0] = 0
-    te_pairs[te_pairs < 0] = 0
-    tr_y = np.load('outputs/tr_y.npy')
-    te_y = np.load('outputs/te_y.npy')
-    return tr_pairs, te_pairs, tr_y, te_y
-def convert_old_audio():
-    audio_samples = pd.read_csv('./outputs/audio.csv.old'
-                                , names=['word', 'voice', 'rate', 'variant', 'file'])
-    audio_samples['phonemes'] = 'unknown'
-    audio_samples['language'] = 'en-US'
-    audio_samples.loc[audio_samples['variant'] == 'normal', 'variant'] = 'low'
-    audio_samples.loc[audio_samples['variant'] == 'phoneme', 'variant'] = 'medium'
-    audio_samples = audio_samples[['word', 'phonemes', 'voice', 'language', 'rate', 'variant', 'file']]
-    audio_samples.to_csv('./outputs/audio_new.csv', index=False, header=False)
 if __name__ == '__main__':
     # sunflower_pairs_data()
     # create_spectrogram_data()
     # create_spectrogram_data('story_words')
-    create_spectrogram_tfrecords('story_words')
+    # create_spectrogram_tfrecords('story_words')
+    # create_spectrogram_tfrecords('story_words_test')
+    # read_siamese_tfrecords('story_all')
+    # read_siamese_tfrecords('story_words_test')
+    # padd_zeros_siamese_tfrecords('story_words')
+    # fix_csv('story_words')
+    # pickle_constants('story_words')
+    # create_spectrogram_tfrecords('audio', sample_count=100)
+    # create_spectrogram_tfrecords('story_all', sample_count=25)
+    # fix_csv('story_words_test')
+    create_spectrogram_tfrecords('story_words_test', sample_count=100, train_test_ratio=0.0)
+    # create_spectrogram_tfrecords('audio', sample_count=50)
+    # read_siamese_tfrecords_generator('audio')
+    # padd_zeros_siamese_tfrecords('audio')
     # create_padded_spectrogram()
     # create_speech_pairs_data()
     # print(speech_model_data())
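The TFRecord schema written above can be exercised in isolation. A minimal round-trip sketch using the same TF 1.x tf.python_io API as this commit (the file path and spectrogram shape are made up for illustration):
import numpy as np
import tensorflow as tf

spec = np.random.rand(15, 331).astype(np.float32)  # fake spectrogram: 15 windows x 331 bins
example = tf.train.Example(features=tf.train.Features(feature={
    'spec1': tf.train.Feature(float_list=tf.train.FloatList(value=spec.reshape(-1))),
    'spec_n1': tf.train.Feature(int64_list=tf.train.Int64List(value=[spec.shape[0]])),
    'spec_w1': tf.train.Feature(int64_list=tf.train.Int64List(value=[spec.shape[1]])),
}))
with tf.python_io.TFRecordWriter('/tmp/demo.tfrecords') as writer:
    writer.write(example.SerializeToString())
for string_record in tf.python_io.tf_record_iterator(path='/tmp/demo.tfrecords'):
    parsed = tf.train.Example()
    parsed.ParseFromString(string_record)
    n = parsed.features.feature['spec_n1'].int64_list.value[0]
    w = parsed.features.feature['spec_w1'].int64_list.value[0]
    restored = np.array(parsed.features.feature['spec1'].float_list.value).reshape(n, w)
    assert np.allclose(restored, spec)  # flat floats plus the stored shape recover the matrix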

164
speech_model.py Normal file
View File

@@ -0,0 +1,164 @@
from __future__ import absolute_import
from __future__ import print_function
import numpy as np
# from speech_data import speech_model_data
from speech_data import read_siamese_tfrecords_generator
from keras.models import Model,load_model,model_from_yaml
from keras.layers import Input, Dense, Dropout, LSTM, Lambda, Concatenate
from keras.losses import categorical_crossentropy
# from keras.losses import binary_crossentropy
from keras.utils import to_categorical
# from keras.utils.np_utils import to_categorical
from keras.optimizers import RMSprop
from keras.callbacks import TensorBoard, ModelCheckpoint
from keras import backend as K
from speech_tools import create_dir
# def euclidean_distance(vects):
# x, y = vects
# return K.sqrt(
# K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))
#
#
# def eucl_dist_output_shape(shapes):
# shape1, shape2 = shapes
# return (shape1[0], 1)
#
#
# def contrastive_loss(y_true, y_pred):
# '''Contrastive loss from Hadsell-et-al.'06
# http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
# '''
# return K.mean(y_true * K.square(y_pred) +
# (1 - y_true) * K.square(K.maximum(1 - y_pred, 0)))
def create_base_rnn_network(input_dim):
'''Base network to be shared (eq. to feature extraction).
'''
inp = Input(shape=input_dim)
# ls0 = LSTM(512, return_sequences=True)(inp)
ls1 = LSTM(256, return_sequences=True)(inp)
ls2 = LSTM(128, return_sequences=True)(ls1)
# ls3 = LSTM(32, return_sequences=True)(ls2)
ls4 = LSTM(64)(ls2)
    # d1 = Dense(128, activation='relu')(ls4)
    d2 = Dense(64, activation='relu')(ls4)  # note: d2 is unused; the model below returns the ls4 embedding
    return Model(inp, ls4)
def compute_accuracy(y_true, y_pred):
'''Compute classification accuracy with a fixed threshold on distances.
'''
pred = y_pred.ravel() > 0.5
return np.mean(pred == y_true)
def accuracy(y_true, y_pred):
'''Compute classification accuracy with a fixed threshold on distances.
'''
return K.mean(K.equal(y_true, K.cast(y_pred > 0.5, y_true.dtype)))
def dense_classifier(processed):
conc_proc = Concatenate()(processed)
d1 = Dense(64, activation='relu')(conc_proc)
# dr1 = Dropout(0.1)(d1)
# d2 = Dense(128, activation='relu')(d1)
d3 = Dense(8, activation='relu')(d1)
# dr2 = Dropout(0.1)(d2)
return Dense(2, activation='softmax')(d3)
def siamese_model(input_dim):
# input_dim = (15, 1654)
base_network = create_base_rnn_network(input_dim)
input_a = Input(shape=input_dim)
input_b = Input(shape=input_dim)
processed_a = base_network(input_a)
processed_b = base_network(input_b)
final_output = dense_classifier([processed_a,processed_b])
model = Model([input_a, input_b], final_output)
# distance = Lambda(
# euclidean_distance,
# output_shape=eucl_dist_output_shape)([processed_a, processed_b])
# model = Model([input_a, input_b], distance)
return model
def write_model_arch(mod,mod_file):
model_f = open(mod_file,'w')
model_f.write(mod.to_yaml())
model_f.close()
def load_model_arch(mod_file):
model_f = open(mod_file,'r')
mod = model_from_yaml(model_f.read())
model_f.close()
return mod
def train_siamese(audio_group = 'audio'):
# the data, shuffled and split between train and test sets
# tr_pairs, te_pairs, tr_y_e, te_y_e = speech_model_data()
batch_size = 128
model_dir = './models/'+audio_group
create_dir(model_dir)
log_dir = './logs/'+audio_group
create_dir(log_dir)
tr_gen_fn,te_pairs,te_y,n_step,n_features,n_records = read_siamese_tfrecords_generator(audio_group,batch_size=batch_size,test_size=batch_size)
tr_gen = tr_gen_fn()
# tr_y = to_categorical(tr_y_e, num_classes=2)
# te_y = to_categorical(te_y_e, num_classes=2)
input_dim = (n_step, n_features)
model = siamese_model(input_dim)
tb_cb = TensorBoard(
log_dir=log_dir,
histogram_freq=1,
batch_size=32,
write_graph=True,
write_grads=True,
write_images=True,
embeddings_freq=0,
embeddings_layer_names=None,
embeddings_metadata=None)
cp_file_fmt = model_dir+'/siamese_speech_model-{epoch:02d}-epoch-{val_loss:0.2f}\
-acc.h5'
cp_cb = ModelCheckpoint(
cp_file_fmt,
monitor='val_loss',
verbose=0,
save_best_only=True,
save_weights_only=True,
mode='auto',
period=1)
# train
rms = RMSprop()#lr=0.001
model.compile(loss=categorical_crossentropy, optimizer=rms, metrics=[accuracy])
write_model_arch(model,model_dir+'/siamese_speech_model_arch.yaml')
# model.fit(
# [tr_pairs[:, 0], tr_pairs[:, 1]],
# tr_y,
# batch_size=128,
# epochs=100,
# validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),
# callbacks=[tb_cb, cp_cb])
model.fit_generator(tr_gen
,epochs=1000
,steps_per_epoch=n_records//batch_size
,validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y)
,use_multiprocessing=True, workers=1
,callbacks=[tb_cb, cp_cb])
model.save(model_dir+'/siamese_speech_model-final.h5')
# compute final accuracy on training and test sets
# y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
# tr_acc = compute_accuracy(tr_y, y_pred)
# print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
    y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
    te_acc = compute_accuracy(te_y, y_pred)  # note: te_y is one-hot (n, 2) here, while compute_accuracy expects 1-D labels
print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
if __name__ == '__main__':
train_siamese('story_words')
# train_siamese('audio')
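A quick shape check of the model above; the dimensions are illustrative, since n_step and n_features normally come from the pickled .constants file:
import numpy as np

input_dim = (15, 331)                  # (time steps, spectrogram bins), illustrative values
model = siamese_model(input_dim)
left = np.random.rand(4, *input_dim)   # a batch of 4 left/right spectrogram pairs
right = np.random.rand(4, *input_dim)
probs = model.predict([left, right])   # 2-way softmax matching the [0,1]/[1,0] TFRecord labels
print(probs.shape)                     # -> (4, 2)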

View File

@@ -3,6 +3,7 @@ from AppKit import NSSpeechSynthesizer, NSSpeechInputModeProperty
 from AppKit import NSSpeechModePhoneme
 from Foundation import NSURL
 import json
+import csv
 import random
 import os
 import re
@@ -11,6 +12,7 @@ import time
 import progressbar
 from generate_similar import similar_phoneme_phrase, similar_phrase
+from speech_tools import format_filename
 OUTPUT_NAME = 'story_words_test'
 dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
@@ -39,7 +41,10 @@ def create_dir(direc):
 def dest_filename(w, v, r, t):
-    return '{}-{}-{}-{}-{}.aiff'.format(w, v, r, t, str(random.randint(0, 10000)))
+    rand_no = str(random.randint(0, 10000))
+    fname = '{}-{}-{}-{}-{}.aiff'.format(w, v, r, t, rand_no)
+    sanitized = format_filename(fname)
+    return sanitized
 def dest_path(v, r, n):
@@ -81,6 +86,11 @@ class SynthFile(object):
         return ','.join([str(c) for c in cols])+'\n'
+    def get_values(self):
+        cols = [self.word, self.phoneme, self.voice,
+                self.voice_lang, self.rate, self.variant,
+                self.filename]
+        return [str(c) for c in cols]
 class SynthVariant(object):
     """docstring for SynthVariant."""
@@ -191,22 +201,11 @@ def synth_generator():
     print("It took {} to synthesize all variants.".format(time_str))
     return synth_for_words
-def write_synths(synth_list, fname, csv=False):
-    f = open(fname, 'w')
-    if csv:
-        for s in synth_list:
-            f.write(s.get_csv())
-    else:
-        json.dump([s.get_json() for s in synth_list], f)
-    f.close()
 def synth_logger(fname, csv=False):
     f = open(fname, 'w')
+    # NB: the csv parameter above shadows the csv module, so this call fails
+    # as written; the boolean flag needs a different name (e.g. as_csv)
+    s_csv_w = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
     def csv_writer(s):
-        f.write(s.get_csv())
+        s_csv_w.writerow(s.get_values())
     synth_list = []
     def json_writer(s):

speech_siamese.py
View File

@@ -1,135 +0,0 @@
from __future__ import absolute_import
from __future__ import print_function
import numpy as np
from speech_data import speech_model_data
from keras.models import Model,load_model
from keras.layers import Input, Dense, Dropout, LSTM, Lambda, Concatenate
from keras.losses import categorical_crossentropy
# from keras.losses import binary_crossentropy
from keras.utils import to_categorical
# from keras.utils.np_utils import to_categorical
from keras.optimizers import RMSprop
from keras.callbacks import TensorBoard, ModelCheckpoint
from keras import backend as K
def euclidean_distance(vects):
x, y = vects
return K.sqrt(
K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))
def eucl_dist_output_shape(shapes):
shape1, shape2 = shapes
return (shape1[0], 1)
def contrastive_loss(y_true, y_pred):
'''Contrastive loss from Hadsell-et-al.'06
http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
'''
return K.mean(y_true * K.square(y_pred) +
(1 - y_true) * K.square(K.maximum(1 - y_pred, 0)))
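# In the paper's notation, for label y (1 = similar pair) and predicted
# distance d, the function above averages over the batch:
#     L(y, d) = y * d^2 + (1 - y) * max(1 - d, 0)^2
# with the margin fixed at 1.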
def create_base_rnn_network(input_dim):
'''Base network to be shared (eq. to feature extraction).
'''
inp = Input(shape=input_dim)
ls1 = LSTM(256, return_sequences=True)(inp)
ls2 = LSTM(128, return_sequences=True)(ls1)
# ls3 = LSTM(32, return_sequences=True)(ls2)
ls4 = LSTM(64)(ls2)
return Model(inp, ls4)
def compute_accuracy(y_true, y_pred):
'''Compute classification accuracy with a fixed threshold on distances.
'''
pred = y_pred.ravel() < 0.5
return np.mean(pred == y_true)
def accuracy(y_true, y_pred):
'''Compute classification accuracy with a fixed threshold on distances.
'''
return K.mean(K.equal(y_true, K.cast(y_pred > 0.5, y_true.dtype)))
def dense_classifier(processed):
conc_proc = Concatenate()(processed)
d1 = Dense(16, activation='relu')(conc_proc)
# dr1 = Dropout(0.1)(d1)
d2 = Dense(8, activation='relu')(d1)
# dr2 = Dropout(0.1)(d2)
return Dense(2, activation='softmax')(d2)
def siamese_model(input_dim):
# input_dim = (15, 1654)
base_network = create_base_rnn_network(input_dim)
input_a = Input(shape=input_dim)
input_b = Input(shape=input_dim)
processed_a = base_network(input_a)
processed_b = base_network(input_b)
final_output = dense_classifier([processed_a,processed_b])
model = Model([input_a, input_b], final_output)
# distance = Lambda(
# euclidean_distance,
# output_shape=eucl_dist_output_shape)([processed_a, processed_b])
# model = Model([input_a, input_b], distance)
return model
def train_siamese():
# the data, shuffled and split between train and test sets
tr_pairs, te_pairs, tr_y_e, te_y_e = speech_model_data()
tr_y = to_categorical(tr_y_e, num_classes=2)
te_y = to_categorical(te_y_e, num_classes=2)
input_dim = (tr_pairs.shape[2], tr_pairs.shape[3])
model = siamese_model(input_dim)
tb_cb = TensorBoard(
log_dir='./logs/siamese_logs',
histogram_freq=1,
batch_size=32,
write_graph=True,
write_grads=True,
write_images=True,
embeddings_freq=0,
embeddings_layer_names=None,
embeddings_metadata=None)
cp_file_fmt = './models/siamese_speech_model-{epoch:02d}-epoch-{val_loss:0.2f}\
-acc.h5'
cp_cb = ModelCheckpoint(
cp_file_fmt,
monitor='val_loss',
verbose=0,
save_best_only=False,
save_weights_only=False,
mode='auto',
period=1)
# train
rms = RMSprop(lr=0.001)
model.compile(loss=categorical_crossentropy, optimizer=rms, metrics=[accuracy])
model.fit(
[tr_pairs[:, 0], tr_pairs[:, 1]],
tr_y,
batch_size=128,
epochs=50,
validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),
callbacks=[tb_cb, cp_cb])
model.save('./models/siamese_speech_model-final.h5')
# compute final accuracy on training and test sets
y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
tr_acc = compute_accuracy(tr_y, y_pred)
y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
te_acc = compute_accuracy(te_y, y_pred)
print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))
if __name__ == '__main__':
train_siamese()

speech_spectrum.py
View File

@@ -13,6 +13,8 @@ from pysndfile import sndio as snd
 from numpy.lib import stride_tricks
 """ short time fourier transform of audio signal """
+STFT_WINDOWS_MSEC = 20
+STFT_WINDOW_OVERLAP = 1.0 / 3
 def stft(sig, frameSize, overlapFac=0.5, window=np.hanning):
     win = window(frameSize)
@@ -74,7 +76,7 @@ def logscale_spec(spec, sr=44100, factor=20.):
 def generate_spec_frec(samples, samplerate):
     # samplerate, samples = wav.read(audiopath)
     # s = stft(samples, binsize)
-    s = stft(samples, samplerate * 150 // 1000, 1.0 / 3)
+    s = stft(samples, samplerate * STFT_WINDOWS_MSEC // 1000, STFT_WINDOW_OVERLAP)
     sshow, freq = logscale_spec(s, factor=1.0, sr=samplerate)
     ims = 20. * np.log10(np.abs(sshow) / 10e-6)
@@ -141,8 +143,11 @@ def play_sunflower():
 if __name__ == '__main__':
-    play_sunflower()
-    # plot_aiff_stft('./outputs/sunflowers-Alex-150-normal-589.aiff')
+    # play_sunflower()
+    plot_aiff_stft('./outputs/story_words/Agnes/150/chicken-Agnes-150-low-1077.aiff')
+    plot_aiff_stft('./outputs/story_words/Agnes/150/chicken-Agnes-150-medium-1762.aiff')
+    # spec = generate_aiff_spectrogram('./outputs/story_words/Agnes/150/chicken-Agnes-150-low-1077.aiff')
+    # print(spec.shape)
     # plot_aiff_stft('./outputs/sunflowers-Alex-180-normal-4763.aiff')
     # plot_aiff_stft('./outputs/sunflowers-Victoria-180-normal-870.aiff')
     # plot_aiff_stft('./outputs/sunflowers-Fred-180-phoneme-9733.aiff')
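With the constants introduced above, the window length works out as follows (assuming the 22050 Hz sample rate used elsewhere in the repo; the previous literal of 150 gave 3307-sample, 150 ms windows):
SAMPLE_RATE = 22050
STFT_WINDOWS_MSEC = 20
frame_size = SAMPLE_RATE * STFT_WINDOWS_MSEC // 1000   # 22050 * 20 // 1000 = 441 samples (20 ms)
print(frame_size)
# the 1/3 overlap factor is passed to stft as overlapFac; the resulting hop
# size depends on the stft implementation, which is not shown in this hunk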

138
speech_test.py Normal file
View File

@@ -0,0 +1,138 @@
from speech_model import load_model_arch
from speech_tools import record_spectrogram, file_player
from speech_data import record_generator_count
# from importlib import reload
# import speech_data
# reload(speech_data)
import numpy as np
import pandas as pd
import os
import pickle
import tensorflow as tf
import csv
from tqdm import tqdm
from speech_data import padd_zeros
def predict_recording_with(m, sample_size=15):
    # note: create_test_pair was removed from speech_data in this commit;
    # this helper needs it restored (or an equivalent padder) before it will run
    spec1 = record_spectrogram(n_sec=1.4)
    spec2 = record_spectrogram(n_sec=1.4)
    inp = create_test_pair(spec1, spec2, sample_size)
    return m.predict([inp[:, 0], inp[:, 1]])
def test_with(audio_group):
    # note: depends on the removed speech_data() loader and a module-level `model`
    X, Y = speech_data(audio_group)
    print(np.argmax(model.predict([X[:, 0], X[:, 1]]), axis=1))
    print(Y.astype(np.int8))
def evaluate_siamese(records_file,audio_group='audio',weights = 'siamese_speech_model-final.h5'):
# audio_group='audio';model_file = 'siamese_speech_model-305-epoch-0.20-acc.h5'
# records_file = os.path.join('./outputs',eval_group+'.train.tfrecords')
const_file = os.path.join('./outputs',audio_group+'.constants')
arch_file='./models/'+audio_group+'/siamese_speech_model_arch.yaml'
weight_file='./models/'+audio_group+'/'+weights
(n_spec,n_features,n_records) = pickle.load(open(const_file,'rb'))
print('evaluating {}...'.format(records_file))
model = load_model_arch(arch_file)
# model = siamese_model((n_spec, n_features))
model.load_weights(weight_file)
record_iterator,records_count = record_generator_count(records_file)
total,same_success,diff_success,skipped,same_failed,diff_failed = 0,0,0,0,0,0
all_results = []
for (i,string_record) in tqdm(enumerate(record_iterator),total=records_count):
# string_record = next(record_iterator)
total+=1
example = tf.train.Example()
example.ParseFromString(string_record)
spec_n1 = example.features.feature['spec_n1'].int64_list.value[0]
spec_n2 = example.features.feature['spec_n2'].int64_list.value[0]
if n_spec < spec_n1 or n_spec < spec_n2:
skipped+=1
continue
spec_w1 = example.features.feature['spec_w1'].int64_list.value[0]
spec_w2 = example.features.feature['spec_w2'].int64_list.value[0]
spec1 = np.array(example.features.feature['spec1'].float_list.value).reshape(spec_n1,spec_w1)
spec2 = np.array(example.features.feature['spec2'].float_list.value).reshape(spec_n2,spec_w2)
word = example.features.feature['word'].bytes_list.value[0].decode()
phoneme1 = example.features.feature['phoneme1'].bytes_list.value[0].decode()
phoneme2 = example.features.feature['phoneme2'].bytes_list.value[0].decode()
voice1 = example.features.feature['voice1'].bytes_list.value[0].decode()
voice2 = example.features.feature['voice2'].bytes_list.value[0].decode()
language = example.features.feature['language'].bytes_list.value[0].decode()
rate1 = example.features.feature['rate1'].int64_list.value[0]
rate2 = example.features.feature['rate2'].int64_list.value[0]
variant1 = example.features.feature['variant1'].bytes_list.value[0].decode()
variant2 = example.features.feature['variant2'].bytes_list.value[0].decode()
file1 = example.features.feature['file1'].bytes_list.value[0].decode()
file2 = example.features.feature['file2'].bytes_list.value[0].decode()
p_spec1,p_spec2 = padd_zeros(spec1,n_spec),padd_zeros(spec2,n_spec)
input_arr = np.asarray([[p_spec1,p_spec2]])
output_arr = np.asarray([example.features.feature['output'].int64_list.value])
y_pred = model.predict([input_arr[:, 0], input_arr[:, 1]])
predicted = np.asarray(y_pred[0]>0.5).astype(output_arr.dtype)
expected = output_arr[0]
status = np.all(predicted == expected)
result = {"phoneme1":phoneme1,"phoneme2":phoneme2,"voice1":voice1
,"voice2":voice2,"rate1":rate1,"rate2":rate2
,"variant1":variant1,"variant2":variant2,"file1":file1
,"file2":file2,"expected":expected[0],"predicted":y_pred[0][0]
,"success":status}
all_results.append(result)
if status:
if variant1 == variant2:
same_success+=1
else:
diff_success+=1
continue
else:
if variant1 == variant2:
same_failed+=1
else:
diff_failed+=1
print('total-{},same_success-{},diff_success-{},skipped-{},same_failed-{},diff_failed-{}'.format(total,same_success,diff_success,skipped,same_failed,diff_failed))
success = same_success+diff_success
failure = same_failed+diff_failed
print('accuracy-{:.3f}'.format(success*100/(success+failure)))
print('same_accuracy-{:.3f}'.format(same_success*100/(same_success+same_failed)))
print('diff_accuracy-{:.3f}'.format(diff_success*100/(diff_success+diff_failed)))
result_data = pd.DataFrame(all_results,columns=["phoneme1","phoneme2"
,"voice1","voice2","rate1","rate2","variant1","variant2","file1","file2",
"expected","predicted","success"])
result_data.to_csv('./outputs/' + audio_group + '.results.csv')
def play_results(audio_group='audio'):
result_data = pd.read_csv('./outputs/' + audio_group + '.results.csv')
play_file,close_player = file_player()
quit = False
for (i,r) in result_data.iterrows():
if quit:
break
keys = ["phoneme1","phoneme2","voice1","voice2","rate1","rate2","variant1","variant2"]
row_vals = [str(r[k]) for k in keys]
h_str = '\t'.join(keys)
row_str = '\t'.join(row_vals)
while True:
print(h_str)
print(row_str)
play_file('./outputs/'+audio_group+'/'+r['file1'],True)
play_file('./outputs/'+audio_group+'/'+r['file2'],True)
a = input("press 'r/q/[Enter]' to replay/quit/continue:\t")
if a == 'r':
continue
if a == 'q':
quit = True
break
else:
break
close_player()
if __name__ == '__main__':
evaluate_siamese('./outputs/story_words_test.train.tfrecords',audio_group='story_words',weights ='siamese_speech_model-712-epoch-0.00-acc.h5')
# evaluate_siamese('./outputs/story_words.test.tfrecords',audio_group='story_words',weights ='siamese_speech_model-675-epoch-0.00-acc.h5')
# play_results('story_words')
# test_with('rand_edu')
# sunflower_data,sunflower_result = get_word_pairs_data('sweater',15)
# print(np.argmax(model.predict([sunflower_data[:, 0], sunflower_data[:, 1]]),axis=1))
# print(sunflower_result)

152
speech_tools.py Normal file
View File

@@ -0,0 +1,152 @@
import os
import string  # used by format_filename below
import threading
import multiprocessing
import pandas as pd
import numpy as np
import pyaudio
from pysndfile import sndio as snd
# from matplotlib import pyplot as plt
from speech_spectrum import plot_stft, generate_spec_frec
SAMPLE_RATE = 22050
N_CHANNELS = 2
def file_player():
p_oup = pyaudio.PyAudio()
def play_file(audiopath,plot=False):
print('playing',audiopath)
samples, samplerate, form = snd.read(audiopath)
stream = p_oup.open(
format=pyaudio.paFloat32,
channels=2,
rate=samplerate,
output=True)
one_channel = np.asarray([samples, samples]).T.reshape(-1)
audio_data = one_channel.astype(np.float32).tobytes()
stream.write(audio_data)
stream.close()
if plot:
plot_stft(samples, SAMPLE_RATE)
def close_player():
p_oup.terminate()
return play_file,close_player
def record_spectrogram(n_sec, plot=False, playback=False):
# show_record_prompt()
N_SEC = n_sec
CHUNKSIZE = int(SAMPLE_RATE * N_SEC / N_CHANNELS) # fixed chunk size
input('Press [Enter] to start recording sample... ')
p_inp = pyaudio.PyAudio()
stream = p_inp.open(
format=pyaudio.paFloat32,
channels=N_CHANNELS,
rate=SAMPLE_RATE,
input=True,
frames_per_buffer=CHUNKSIZE)
data = stream.read(CHUNKSIZE)
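    # interleaved float32 stereo -> (frames, 2); the mean of per-channel
    # magnitudes gives one mono magnitude track, which is then mirrored as
    # +m/-m samples so the buffer can be replayed on the 2-channel stream below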
numpydata = np.frombuffer(data, dtype=np.float32)
multi_channel = np.abs(np.reshape(numpydata, (-1, 2))).mean(axis=1)
one_channel = np.asarray([multi_channel, -1 * multi_channel]).T.reshape(-1)
mean_channel_data = one_channel.tobytes()
stream.stop_stream()
stream.close()
p_inp.terminate()
if plot:
plot_stft(one_channel, SAMPLE_RATE)
if playback:
p_oup = pyaudio.PyAudio()
stream = p_oup.open(
format=pyaudio.paFloat32,
channels=2,
rate=SAMPLE_RATE,
output=True)
stream.write(mean_channel_data)
stream.close()
p_oup.terminate()
ims, _ = generate_spec_frec(one_channel, SAMPLE_RATE)
return ims
def _apply_df(args):
df, func, num, kwargs = args
return num, df.apply(func, **kwargs)
def apply_by_multiprocessing(df,func,**kwargs):
cores = multiprocessing.cpu_count()
workers=kwargs.pop('workers') if 'workers' in kwargs else cores
pool = multiprocessing.Pool(processes=workers)
result = pool.map(_apply_df, [(d, func, i, kwargs) for i,d in enumerate(np.array_split(df, workers))])
pool.close()
result=sorted(result,key=lambda x:x[0])
return pd.concat([i[1] for i in result])
def square(x):
    return x ** 2  # elementwise square (demo function for the parallel apply; x**x computed x to the power x)
if __name__ == '__main__':
df = pd.DataFrame({'a':range(10), 'b':range(10)})
apply_by_multiprocessing(df, square, axis=1, workers=4)
def rm_rf(d):
for path in (os.path.join(d,f) for f in os.listdir(d)):
if os.path.isdir(path):
rm_rf(path)
else:
os.unlink(path)
os.rmdir(d)
def create_dir(direc):
if not os.path.exists(direc):
os.makedirs(direc)
else:
rm_rf(direc)
create_dir(direc)
#################### Now make the data generator threadsafe ####################
class threadsafe_iter:
    """Takes an iterator/generator and makes it thread-safe by
    serializing calls to the `next` method of the given iterator/generator.
    """
def __init__(self, it):
self.it = it
self.lock = threading.Lock()
def __iter__(self):
return self
def __next__(self): # Py3
with self.lock:
return next(self.it)
def next(self): # Py2
with self.lock:
return self.it.next()
def threadsafe_generator(f):
"""A decorator that takes a generator function and makes it thread-safe.
"""
def g(*a, **kw):
return threadsafe_iter(f(*a, **kw))
return g
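# A short usage sketch for the wrapper above (illustrative generator name;
# this is the pattern for sharing one generator across fit_generator workers):
@threadsafe_generator
def counting_batches():
    i = 0
    while True:
        yield i
        i += 1
# gen = counting_batches()     # each next(gen) is now guarded by a lock
# print(next(gen), next(gen))  # -> 0 1, even when called from several threads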
def format_filename(s):
"""
Take a string and return a valid filename constructed from the string.
Uses a whitelist approach: any characters not present in valid_chars are
removed. Also spaces are replaced with underscores.
Note: this method may produce invalid filenames such as ``, `.` or `..`
When I use this method I prepend a date string like '2009_01_15_19_46_32_'
and append a file extension like '.txt', so I avoid the potential of using
an invalid filename.
"""
valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
filename = ''.join(c for c in s if c in valid_chars)
filename = filename.replace(' ','_') # I don't like spaces in filenames.
return filename
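# Example of the whitelist above:
#   format_filename('chicken soup?*-Agnes-150-low-1077.aiff')
#   -> 'chicken_soup-Agnes-150-low-1077.aiff'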

speech_predict.py
View File

@@ -1,30 +0,0 @@
from speech_siamese import siamese_model
from record_mic_speech import record_spectrogram
from importlib import reload
# import speech_data
# reload(speech_data)
from speech_data import create_test_pair,get_word_pairs_data,speech_data
import numpy as np
model = siamese_model((15, 1654))
model.load_weights('./models/siamese_speech_model-final.h5')
def predict_recording_with(m,sample_size=15):
spec1 = record_spectrogram(n_sec=1.4)
spec2 = record_spectrogram(n_sec=1.4)
inp = create_test_pair(spec1,spec2,sample_size)
return m.predict([inp[:, 0], inp[:, 1]])
# while(True):
# print(predict_recording_with(model))
def test_with(audio_group):
X,Y = speech_data(audio_group)
print(np.argmax(model.predict([X[:, 0], X[:, 1]]),axis=1))
print(Y.astype(np.int8))
test_with('rand_edu')
# sunflower_data,sunflower_result = get_word_pairs_data('sweater',15)
# print(np.argmax(model.predict([sunflower_data[:, 0], sunflower_data[:, 1]]),axis=1))
# print(sunflower_result)