Implement phoneme-segmented training on samples

master
Malar Kannan 2017-12-28 18:53:54 +05:30
parent 507da49cfa
commit 4dd4bb5963
7 changed files with 177 additions and 10 deletions

View File

@@ -114,7 +114,7 @@ def train_segment(collection_name = 'test',resume_weights='',initial_epoch=0):
        cp_file_fmt,
        monitor='val_loss',
        verbose=0,
-        save_best_only=True,
+        save_best_only=False,
        save_weights_only=True,
        mode='auto',
        period=1)
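Note: with save_best_only=False, Keras's ModelCheckpoint writes weights every `period` epochs regardless of val_loss, instead of only when val_loss improves.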

View File

@@ -1,13 +1,13 @@
import pandas as pd
-from speech_tools import apply_by_multiprocessing,threadsafe_iter,reservoir_sample,padd_zeros
+from speech_tools import *
+from speech_pitch import *
# import dask as dd
# import dask.dataframe as ddf
import tensorflow as tf
from tensorflow.python.ops import data_flow_ops
import numpy as np
-from speech_spectrum import generate_aiff_spectrogram
-from speech_pitch import pitch_array
-from speech_pitch import compute_mfcc
+from speech_spectrum import generate_aiff_spectrogram,generate_sample_spectrogram
+from speech_similar import segmentable_phoneme
from sklearn.model_selection import train_test_split
import os,shutil
import random
@@ -39,6 +39,58 @@ def siamese_pairs(rightGroup, wrongGroup):
    # return rightRightPairs[:10],rightWrongPairs[:10]
    return validRRPairs[:32],validRWPairs[:32]
+def seg_siamese_pairs(rightGroup, wrongGroup):
+    group1 = [r for (i, r) in rightGroup.iterrows()]
+    group2 = [r for (i, r) in wrongGroup.iterrows()]
+    rightWrongPairs = [(g1, g2) for g2 in group2 for g1 in group1]  # +[(g2, g1) for g2 in group2 for g1 in group1]
+    rightRightPairs = [i for i in itertools.combinations(group1, 2)]  # +[i for i in itertools.combinations(group2, 2)]
+    def filter_criteria(s1, s2):
+        same = s1['variant'] == s2['variant']
+        phon_same = s1['phonemes'] == s2['phonemes']
+        voice_diff = s1['voice'] != s2['voice']
+        # drop cross-variant pairs whose phoneme strings are identical
+        if not same and phon_same:
+            return False
+        # if same and not voice_diff:
+        #     return False
+        return True
+    validRWPairs = [i for i in rightWrongPairs if filter_criteria(*i)]
+    validRRPairs = [i for i in rightRightPairs if filter_criteria(*i)]
+    random.shuffle(validRWPairs)
+    random.shuffle(validRRPairs)
+    rrPhonePairs = []
+    rwPhonePairs = []
+    def compute_seg_spec(s1, s2):
+        phon_count = len(s1['parsed_phoneme'])
+        seg1_count = len(s1['segments'].index)
+        seg2_count = len(s2['segments'].index)
+        # only use pairs whose alignment produced one segment per phoneme
+        if phon_count == seg1_count and seg2_count == phon_count:
+            s1nd, s2nd = pm_snd(s1['file_path']), pm_snd(s2['file_path'])
+            segs1 = [tuple(x) for x in s1['segments'][['start','end']].values]
+            segs2 = [tuple(x) for x in s2['segments'][['start','end']].values]
+            pp12 = zip(s1['parsed_phoneme'], s2['parsed_phoneme'], segs1, segs2)
+            for (p1, p2, (s1s, s1e), (s2s, s2e)) in pp12:
+                # copy per phoneme so every appended pair keeps its own spectrogram
+                s1_cp = pd.Series(s1)
+                s2_cp = pd.Series(s2)
+                s1_cp['spectrogram'] = generate_sample_spectrogram(s1nd.extract_part(s1s, s1e).values)
+                s2_cp['spectrogram'] = generate_sample_spectrogram(s2nd.extract_part(s2s, s2e).values)
+                if repr(p1) == repr(p2):
+                    rrPhonePairs.append((s1_cp, s2_cp))
+                else:
+                    rwPhonePairs.append((s1_cp, s2_cp))
+    for (s1, s2) in validRRPairs:
+        compute_seg_spec(s1, s2)
+    for (s1, s2) in validRWPairs:
+        compute_seg_spec(s1, s2)
+    return rrPhonePairs[:32], rwPhonePairs[:32]
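A minimal usage sketch, assuming two DataFrames that already carry the columns used above ('variant', 'phonemes', 'voice', 'file_path', 'parsed_phoneme', 'segments'), as prepared in create_seg_phonpair_tfrecords below:

    low = g.loc[g['variant'] == 'low']
    medium = g.loc[g['variant'] == 'medium']
    same, diff = seg_siamese_pairs(low, medium)
    # same/diff are per-phoneme (row, row) pairs, each row a copy with its
    # phoneme-level 'spectrogram' attached; both lists are capped at 32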
def _float_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))
@@ -227,6 +279,94 @@ def convert_old_audio():
    audio_samples = audio_samples[['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file']]
    audio_samples.to_csv('./outputs/audio_new.csv',index=False,header=False)
+def generate_sppas_trans(audio_group='story_words.all'):
+    audio_samples = pd.read_csv('./outputs/' + audio_group + '.fixed.csv', index_col=0)
+    audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x)
+    # audio_samples = audio_samples.head(5)
+    rows = tqdm(audio_samples.iterrows(), total=len(audio_samples.index),
+                desc='Transcribing Words ')
+    for (i, row) in rows:
+        rows.set_postfix(word=row['word'])
+        transribe_audio_text(row['file_path'], row['word'])
+    rows.close()
+def create_seg_phonpair_tfrecords(audio_group='story_words.all', sample_count=0, train_test_ratio=0.1):
+    audio_samples = pd.read_csv('./outputs/' + audio_group + '.fixed.csv', index_col=0)
+    audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x)
+    audio_samples = audio_samples[(audio_samples['variant'] == 'low') | (audio_samples['variant'] == 'medium')]
+    audio_samples['parsed_phoneme'] = apply_by_multiprocessing(audio_samples['phonemes'], segmentable_phoneme)
+    audio_samples['segments'] = apply_by_multiprocessing(audio_samples['file_path'], read_seg_file)
+    n_records, n_spec, n_features = 0, 0, 0
+    def write_samples(wg, sample_name):
+        nonlocal n_spec, n_records, n_features
+        word_group_prog = tqdm(wg, desc='Computing PhonPair spectrogram')
+        record_file = './outputs/{}.{}.tfrecords'.format(audio_group, sample_name)
+        writer = tf.python_io.TFRecordWriter(record_file)
+        for (w, word_group) in word_group_prog:
+            word_group_prog.set_postfix(word=w, sample_name=sample_name)
+            g = word_group.reset_index()
+            # word-level alternatives (pitch_array / generate_aiff_spectrogram /
+            # compute_mfcc) replaced by per-phoneme spectrograms from seg_siamese_pairs
+            sample_right = g.loc[g['variant'] == 'low']
+            sample_wrong = g.loc[g['variant'] == 'medium']
+            same, diff = seg_siamese_pairs(sample_right, sample_wrong)
+            groups = [([0, 1], same), ([1, 0], diff)]
+            for (output, group) in groups:
+                group_prog = tqdm(group, desc='Writing Spectrogram')
+                for sample1, sample2 in group_prog:
+                    group_prog.set_postfix(output=output,
+                                           var1=sample1['variant'],
+                                           var2=sample2['variant'])
+                    spectro1, spectro2 = sample1['spectrogram'], sample2['spectrogram']
+                    spec_n1, spec_n2 = spectro1.shape[0], spectro2.shape[0]
+                    spec_w1, spec_w2 = spectro1.shape[1], spectro2.shape[1]
+                    spec1, spec2 = spectro1.reshape(-1), spectro2.reshape(-1)
+                    n_spec = max([n_spec, spec_n1, spec_n2])
+                    n_features = spec_w1
+                    n_records += 1
+                    example = tf.train.Example(features=tf.train.Features(
+                        feature={
+                            'word': _bytes_feature([w.encode('utf-8')]),
+                            'phoneme1': _bytes_feature([sample1['phonemes'].encode('utf-8')]),
+                            'phoneme2': _bytes_feature([sample2['phonemes'].encode('utf-8')]),
+                            'voice1': _bytes_feature([sample1['voice'].encode('utf-8')]),
+                            'voice2': _bytes_feature([sample2['voice'].encode('utf-8')]),
+                            'language': _bytes_feature([sample1['language'].encode('utf-8')]),
+                            'rate1': _int64_feature([sample1['rate']]),
+                            'rate2': _int64_feature([sample2['rate']]),
+                            'variant1': _bytes_feature([sample1['variant'].encode('utf-8')]),
+                            'variant2': _bytes_feature([sample2['variant'].encode('utf-8')]),
+                            'file1': _bytes_feature([sample1['file'].encode('utf-8')]),
+                            'file2': _bytes_feature([sample2['file'].encode('utf-8')]),
+                            'spec1': _float_feature(spec1),
+                            'spec2': _float_feature(spec2),
+                            'spec_n1': _int64_feature([spec_n1]),
+                            'spec_w1': _int64_feature([spec_w1]),
+                            'spec_n2': _int64_feature([spec_n2]),
+                            'spec_w2': _int64_feature([spec_w2]),
+                            'output': _int64_feature(output)
+                        }))
+                    writer.write(example.SerializeToString())
+                group_prog.close()
+        word_group_prog.close()
+        writer.close()
+    word_groups = [i for i in audio_samples.groupby('word')]
+    wg_sampled = reservoir_sample(word_groups, sample_count) if sample_count > 0 else word_groups
+    tr_audio_samples, te_audio_samples = train_test_split(wg_sampled, test_size=train_test_ratio)
+    write_samples(tr_audio_samples, 'train')
+    write_samples(te_audio_samples, 'test')
+    const_file = os.path.join('./outputs', audio_group + '.constants')
+    pickle.dump((n_spec, n_features, n_records), open(const_file, 'wb'))
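For the training side, a minimal parser sketch (assumed; the reader is not part of this diff) using the TF 1.x API seen above. Keys and dtypes mirror the writer exactly; spec1/spec2 are variable-length, so they need VarLenFeature and a reshape from the stored dims:

    def parse_seg_phonpair(serialized):
        # hypothetical reader; keys/dtypes mirror the writer above
        feats = tf.parse_single_example(serialized, features={
            'spec1': tf.VarLenFeature(tf.float32),
            'spec2': tf.VarLenFeature(tf.float32),
            'spec_n1': tf.FixedLenFeature([], tf.int64),
            'spec_w1': tf.FixedLenFeature([], tf.int64),
            'spec_n2': tf.FixedLenFeature([], tf.int64),
            'spec_w2': tf.FixedLenFeature([], tf.int64),
            'output': tf.FixedLenFeature([2], tf.int64)})
        spec1 = tf.reshape(tf.sparse_tensor_to_dense(feats['spec1']),
                           [feats['spec_n1'], feats['spec_w1']])
        spec2 = tf.reshape(tf.sparse_tensor_to_dense(feats['spec2']),
                           [feats['spec_n2'], feats['spec_w2']])
        return spec1, spec2, feats['output']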
if __name__ == '__main__':
    # sunflower_pairs_data()
    # create_spectrogram_data()
@@ -241,8 +381,10 @@ if __name__ == '__main__':
    # create_spectrogram_tfrecords('audio',sample_count=100)
    # create_spectrogram_tfrecords('story_all',sample_count=25)
    # fix_csv('story_words_test')
-    # fix_csv('story_words')
-    create_spectrogram_tfrecords('story_words',sample_count=100,train_test_ratio=0.1)
+    # fix_csv('test_5_words')
+    # generate_sppas_trans('test_5_words')
+    create_seg_phonpair_tfrecords('test_5_words')
+    # create_spectrogram_tfrecords('story_words.all',sample_count=0,train_test_ratio=0.1)
    # record_generator_count()
    # create_spectrogram_tfrecords('audio',sample_count=50)
    # read_siamese_tfrecords_generator('audio')

View File

@@ -131,4 +131,4 @@ def train_siamese(audio_group = 'audio',resume_weights='',initial_epoch=0):
if __name__ == '__main__':
-    train_siamese('story_words')
+    train_siamese('test_5_words')

View File

@@ -30,7 +30,7 @@ def compute_mfcc(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
    return sample_mfcc.to_array()
def compute_formants(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
-    sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'
+    # sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'
    sample_sound = pm_snd(sample_file)
    sample_formant = sample_sound.to_formant_burg()
    # sample_formant.x_bins()

View File

@@ -120,9 +120,11 @@ def parse_apple_phonemes(ph_str):
    elif pref[0].isdigit() and pref[1:] in apple_phonemes:
        return [ApplePhoneme(pref[1:], int(pref[0]), True)] + parse_apple_phonemes(rest)
    elif not pref.isalnum():
-        return [ApplePhoneme(pref, 0, False)] + parse_apple_phonemes(rest)
+        return [ApplePhoneme(pref, -1, False)] + parse_apple_phonemes(rest)
    return []
+def segmentable_phoneme(ph_str):
+    return [p for p in parse_apple_phonemes(ph_str) if p.stress >= 0]
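The stress change above is what makes this filter work: non-alphanumeric tokens now parse with stress -1, so segmentable_phoneme keeps only tokens that map to alignable phoneme segments. A sketch with a hypothetical Apple phoneme string:

    phons = segmentable_phoneme('h1AU.')               # hypothetical input
    assert all(p.stress >= 0 for p in phons)           # punctuation (stress -1) dropped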
def similar_phoneme_word(ph_str):
    phons = parse_apple_phonemes(ph_str)

View File

@@ -79,6 +79,9 @@ def generate_spec_frec(samples, samplerate):
    ims[ims < 0] = 0  # np.finfo(sshow.dtype).eps
    return ims, freq
+def generate_sample_spectrogram(samples):
+    ims, _ = generate_spec_frec(samples, 22050)
+    return ims
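This assumes the raw samples are at 22050 Hz, matching SAMPLE_RATE in speech_tools. A usage sketch, with pm_snd/extract_part as used in seg_siamese_pairs (file path hypothetical):

    snd_obj = pm_snd('outputs/test_5_words/some_word.aiff')
    spec = generate_sample_spectrogram(snd_obj.extract_part(0.05, 0.20).values)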
def generate_aiff_spectrogram(audiopath):
    samples, samplerate, _ = snd.read(audiopath)

View File

@@ -5,6 +5,7 @@ import threading
import itertools
import random
import multiprocessing
+import subprocess
import pandas as pd
import numpy as np
import pyaudio
@@ -15,6 +16,8 @@ from speech_spectrum import plot_stft, generate_spec_frec,generate_aiff_spectrogram
SAMPLE_RATE = 22050
N_CHANNELS = 2
+devnull = open(os.devnull, 'w')
def step_count(n_records,batch_size):
    return int(math.ceil(n_records*1.0/batch_size))
@@ -56,6 +59,13 @@ def padd_zeros(spgr, max_samples):
    return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
                      'constant')
+def read_seg_file(aiff_name):
+    base_name = aiff_name.rsplit('.aiff', 1)[0]
+    seg_file = base_name + '-palign.csv'
+    seg_data = pd.read_csv(seg_file, names=['action','start','end','phoneme'])
+    # keep only phoneme-alignment rows; '#' rows mark silence
+    seg_data = seg_data[(seg_data['action'] == 'PhonAlign') & (seg_data['phoneme'] != '#')]
+    return seg_data
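read_seg_file expects the alignment export to sit next to each .aiff file; a hypothetical some_word-palign.csv, based on the columns parsed above, would look like:

    PhonAlign,0.00,0.05,#
    PhonAlign,0.05,0.20,h
    PhonAlign,0.20,0.31,AU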
def record_spectrogram(n_sec, plot=False, playback=False):
    # show_record_prompt()
    N_SEC = n_sec
@@ -96,6 +106,16 @@ def pair_for_word(phrase='able'):
    spec2 = generate_aiff_spectrogram('./inputs/pairs/test/'+phrase+'.aiff')
    return spec1,spec2
+def transribe_audio_text(aiff_name, phrase):
+    base_name = aiff_name.rsplit('.aiff', 1)[0]
+    wav_name = base_name + '.wav'
+    txt_name = base_name + '.txt'
+    # convert aiff -> wav and write the matching transcript next to it
+    params = ['ffmpeg', '-y', '-i', aiff_name, wav_name]
+    subprocess.call(params, stdout=devnull, stderr=devnull)
+    with open(txt_name, 'w') as trcr_f:
+        trcr_f.write(phrase)
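This prepares the .wav/.txt pair that an external aligner consumes (SPPAS here, judging by the -palign.csv files read above); the alignment run itself happens outside this module. A usage sketch with a hypothetical file:

    transribe_audio_text('outputs/test_5_words/hello-Alex-180-low-1.aiff', 'hello')
    # -> hello-Alex-180-low-1.wav and .txt, ready for alignment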
def _apply_df(args):
    df, func, num, kwargs = args
    return num, df.apply(func, **kwargs)