Compare commits

..

20 Commits

Author SHA1 Message Date
Malar Kannan
225a720f18 updated README to include testing 2017-12-29 16:21:38 +05:30
Malar Kannan
b267b89a44 Merge branch 'master' of /home/ilml/Public/Repos/speech_scoring 2017-12-29 13:15:51 +05:30
Malar Kannan
eb10b577ae Added README.md describing the workflow 2017-12-29 13:14:37 +05:30
Malar Kannan
ee2eb63f66 Merge branch 'master' of ssh://invnuc/~/Public/Repos/speech_scoring 2017-12-28 20:02:44 +05:30
Malar Kannan
2ae269d939 generating test for phone seg model 2017-12-28 20:01:44 +05:30
Malar Kannan
40d7933870 saving model on better 'acc' 2017-12-28 20:00:19 +05:30
Malar Kannan
4dd4bb5963 implemented phoneme segmented training on samples 2017-12-28 18:53:54 +05:30
Malar Kannan
0600482fe5 generating segmentation for words 2017-12-28 13:37:27 +05:30
Malar Kannan
507da49cfa added voicerss tts support for test data generation 2017-12-26 14:32:56 +05:30
Malar Kannan
f44665e9b2 1. fixed softmax output and overfit the model for small sample
2. updated to run on complete data
2017-12-12 12:18:27 +05:30
Malar Kannan
cc4fbe45b9 trying to overfit 2 samples with model -> doesn't seem to converge 2017-12-11 15:03:14 +05:30
Malar Kannan
8d550c58cc fixed batch normalization layer before activation 2017-12-11 14:33:56 +05:30
Malar Kannan
240ecb3f27 removed bn output layer 2017-12-11 14:12:23 +05:30
Malar Kannan
05242d5991 added batch normalization 2017-12-11 14:09:04 +05:30
Malar Kannan
fea9184aec using the full data and fixed typo in model layer name 2017-12-11 13:47:30 +05:30
Malar Kannan
a6543491f8 fixed empty phoneme boundary case 2017-12-11 13:05:46 +05:30
Malar Kannan
d387922f7d added dense-relu/softmax layers to segment output 2017-12-11 12:30:08 +05:30
Malar Kannan
52bbb69c65 resuming segment training 2017-12-10 21:58:55 +05:30
Malar Kannan
03edd935ea fixed input_dim 2017-12-07 17:16:05 +05:30
Malar Kannan
a7f1451a7f fixed exception in data generation 2017-12-07 16:49:34 +05:30
16 changed files with 448 additions and 59 deletions

2
CLI.md Normal file
View File

@@ -0,0 +1,2 @@
### Convert audio files
$ `for f in *.mp3; do ffmpeg -i "$f" "${f%.mp3}.aiff"; done`

23
README.md Normal file
View File

@@ -0,0 +1,23 @@
### Setup
`. env/bin/activate` to activate the virtualenv.
### Data Generation
* update `OUTPUT_NAME` in *speech_samplegen.py* to create the dataset folder with the name
* `python speech_samplegen.py` generates variants of audio samples
### Data Preprocessing
* `python speech_data.py` creates the training-testing data from the generated samples.
* run `fix_csv(OUTPUT_NAME)` once to create the fixed index of the dataset generated
* run `generate_sppas_trans(OUTPUT_NAME)` once to create the SPPAS transcription(wav+txt) data
* run `$ (SPPAS_DIR)/bin/annotation.py -l eng -e csv --ipus --tok --phon --align --align -w ./outputs/OUTPUT_NAME/` once to create the phoneme alignment csv files for all variants.
* `create_seg_phonpair_tfrecords(OUTPUT_NAME)` creates the tfrecords files
with the phoneme level pairs of right/wrong stresses
### Training
* `python speech_model.py` trains the model with the training data generated.
* `train_siamese(OUTPUT_NAME)` trains the siamese model with the generated dataset.
### Testing
* `python speech_test.py` tests the trained model with the test dataset
* `evaluate_siamese(TEST_RECORD_FILE,audio_group=OUTPUT_NAME,weights = WEIGHTS_FILE_NAME)`
the TEST_RECORD_FILE will be under outputs directory and WEIGHTS_FILE_NAME will be under the models directory, pick the most recent weights file.

View File

@@ -113,7 +113,9 @@ def plot_segments(collection_name = 'story_test_segments'):
def generate_spec(aiff_file): def generate_spec(aiff_file):
phrase_sample = pm_snd(aiff_file) phrase_sample = pm_snd(aiff_file)
phrase_spec = phrase_sample.to_spectrogram(window_length=SPEC_WINDOW_SIZE, maximum_frequency=SPEC_MAX_FREQUENCY) phrase_spec = phrase_sample.to_spectrogram(window_length=SPEC_WINDOW_SIZE, maximum_frequency=SPEC_MAX_FREQUENCY)
sg_db = 10 * np.log10(phrase_spec.values) sshow_abs = np.abs(phrase_spec.values + np.finfo(phrase_spec.values.dtype).eps)
sg_db = 10 * np.log10(sshow_abs)
sg_db[sg_db < 0] = 0
return sg_db,phrase_spec return sg_db,phrase_spec
@@ -142,6 +144,7 @@ def create_segments_tfrecords(collection_name='story_test_segments',sample_count
fname = g.iloc[0]['filename'] fname = g.iloc[0]['filename']
sg_db,phrase_spec = generate_spec(g.iloc[0]['file_path']) sg_db,phrase_spec = generate_spec(g.iloc[0]['file_path'])
phon_stops = [] phon_stops = []
phrase_groups.set_postfix(phrase=ph)
spec_n,spec_w = sg_db.shape spec_n,spec_w = sg_db.shape
spec = sg_db.reshape(-1) spec = sg_db.reshape(-1)
for (i,phon) in g.iterrows(): for (i,phon) in g.iterrows():
@@ -153,8 +156,8 @@ def create_segments_tfrecords(collection_name='story_test_segments',sample_count
f_bounds = [spec_frame(phrase_spec,b) for b in ph_bounds] f_bounds = [spec_frame(phrase_spec,b) for b in ph_bounds]
valid_bounds = [i for i in f_bounds if 0 < i < spec_n] valid_bounds = [i for i in f_bounds if 0 < i < spec_n]
b_frames = np.asarray(valid_bounds) b_frames = np.asarray(valid_bounds)
# print(spec_n,b_frames) if len(b_frames) > 0:
result[b_frames] = 1 result[b_frames] = 1
nonlocal n_records,n_spec,n_features nonlocal n_records,n_spec,n_features
n_spec = max([n_spec,spec_n]) n_spec = max([n_spec,spec_n])
n_features = spec_w n_features = spec_w
@@ -175,6 +178,7 @@ def create_segments_tfrecords(collection_name='story_test_segments',sample_count
word_groups = [i for i in audio_samples.groupby('phrase')] word_groups = [i for i in audio_samples.groupby('phrase')]
wg_sampled = reservoir_sample(word_groups,sample_count) if sample_count > 0 else word_groups wg_sampled = reservoir_sample(word_groups,sample_count) if sample_count > 0 else word_groups
# write_samples(word_groups,'all')
tr_audio_samples,te_audio_samples = train_test_split(wg_sampled,test_size=train_test_ratio) tr_audio_samples,te_audio_samples = train_test_split(wg_sampled,test_size=train_test_ratio)
write_samples(tr_audio_samples,'train') write_samples(tr_audio_samples,'train')
write_samples(te_audio_samples,'test') write_samples(te_audio_samples,'test')
@@ -235,10 +239,8 @@ def read_segments_tfrecords_generator(collection_name='audio',batch_size=32,test
random_samples = enumerate(reservoir_sample(te_re_iterator,test_size)) random_samples = enumerate(reservoir_sample(te_re_iterator,test_size))
for (i,string_record) in tqdm(random_samples,total=test_size): for (i,string_record) in tqdm(random_samples,total=test_size):
# (i,string_record) = next(random_samples) # (i,string_record) = next(random_samples)
# string_record
example = tf.train.Example() example = tf.train.Example()
example.ParseFromString(string_record) example.ParseFromString(string_record)
# example.features.feature['spec'].float_list.value
spec_n = example.features.feature['spec_n'].int64_list.value[0] spec_n = example.features.feature['spec_n'].int64_list.value[0]
spec_w = example.features.feature['spec_w'].int64_list.value[0] spec_w = example.features.feature['spec_w'].int64_list.value[0]
spec = np.array(example.features.feature['spec'].float_list.value).reshape(spec_n,spec_w) spec = np.array(example.features.feature['spec'].float_list.value).reshape(spec_n,spec_w)
@@ -254,9 +256,9 @@ if __name__ == '__main__':
# plot_random_phrases() # plot_random_phrases()
# fix_csv('story_test_segments') # fix_csv('story_test_segments')
# plot_segments('story_test_segments') # plot_segments('story_test_segments')
# fix_csv('story_test') # fix_csv('story_words')
pass # pass
# create_segments_tfrecords('story_test') create_segments_tfrecords('story_words.30', sample_count=36,train_test_ratio=0.1)
# record_generator,input_data,output_data,copy_read_consts = read_segments_tfrecords_generator('story_test') # record_generator,input_data,output_data,copy_read_consts = read_segments_tfrecords_generator('story_test')
# tr_gen = record_generator() # tr_gen = record_generator()
# for i in tr_gen: # for i in tr_gen:

View File

@@ -4,10 +4,10 @@ import numpy as np
from keras.models import Model,load_model,model_from_yaml from keras.models import Model,load_model,model_from_yaml
from keras.layers import Input,Concatenate,Lambda, Reshape, Dropout from keras.layers import Input,Concatenate,Lambda, Reshape, Dropout
from keras.layers import Dense,Conv2D, LSTM, Bidirectional, GRU from keras.layers import Dense,Conv2D, LSTM, Bidirectional, GRU
from keras.layers import BatchNormalization from keras.layers import BatchNormalization,Activation
from keras.losses import categorical_crossentropy from keras.losses import categorical_crossentropy
from keras.utils import to_categorical from keras.utils import to_categorical
from keras.optimizers import RMSprop from keras.optimizers import RMSprop,Adadelta,Adagrad,Adam,Nadam
from keras.callbacks import TensorBoard, ModelCheckpoint from keras.callbacks import TensorBoard, ModelCheckpoint
from keras import backend as K from keras import backend as K
from keras.utils import plot_model from keras.utils import plot_model
@@ -36,32 +36,38 @@ def ctc_lambda_func(args):
return K.ctc_batch_cost(labels, y_pred, input_length, label_length) return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
def segment_model(input_dim): def segment_model(input_dim):
input_dim = (100,100,1)
inp = Input(shape=input_dim) inp = Input(shape=input_dim)
cnv1 = Conv2D(filters=32, kernel_size=(5,9))(inp) cnv1 = Conv2D(filters=32, kernel_size=(5,9))(inp)
cnv2 = Conv2D(filters=1, kernel_size=(5,9))(cnv1) cnv2 = Conv2D(filters=1, kernel_size=(5,9))(cnv1)
dr_cnv2 = Dropout(rate=0.95)(cnv2) dr_cnv2 = Dropout(rate=0.95)(cnv2)
# dr_cnv2
cn_rnn_dim = (dr_cnv2.shape[1].value,dr_cnv2.shape[2].value) cn_rnn_dim = (dr_cnv2.shape[1].value,dr_cnv2.shape[2].value)
r_dr_cnv2 = Reshape(target_shape=cn_rnn_dim)(dr_cnv2) r_dr_cnv2 = Reshape(target_shape=cn_rnn_dim)(dr_cnv2)
b_gr1 = Bidirectional(GRU(512, return_sequences=True),merge_mode='sum')(r_dr_cnv2) b_gr1 = Bidirectional(GRU(512, return_sequences=True),merge_mode='sum')(r_dr_cnv2)
# b_gr1
b_gr2 = Bidirectional(GRU(512, return_sequences=True),merge_mode='sum')(b_gr1) b_gr2 = Bidirectional(GRU(512, return_sequences=True),merge_mode='sum')(b_gr1)
b_gr3 = Bidirectional(GRU(512, return_sequences=True),merge_mode='sum')(b_gr2) b_gr3 = Bidirectional(GRU(512, return_sequences=True),merge_mode='sum')(b_gr2)
# b_gr3
oup = Dense(2, activation='softmax')(b_gr3) oup = Dense(2, activation='softmax')(b_gr3)
# oup
return Model(inp, oup) return Model(inp, oup)
def simple_segment_model(input_dim): def simple_segment_model(input_dim):
# input_dim = (100,100)
input_dim = (506,743)
inp = Input(shape=input_dim) inp = Input(shape=input_dim)
b_gr1 = Bidirectional(GRU(256, return_sequences=True),merge_mode='sum')(inp) b_gr1 = Bidirectional(LSTM(32, return_sequences=True))(inp)
# b_gr1 b_gr1 = Bidirectional(LSTM(16, return_sequences=True),merge_mode='sum')(b_gr1)
b_gr2 = Bidirectional(GRU(64, return_sequences=True),merge_mode='sum')(b_gr1) b_gr1 = LSTM(1, return_sequences=True,activation='softmax')(b_gr1)
b_gr3 = Bidirectional(GRU(1, return_sequences=True),merge_mode='sum')(b_gr2) # b_gr1 = LSTM(4, return_sequences=True)(b_gr1)
oup = Reshape(target_shape=(input_dim[0],))(b_gr3) # b_gr1 = LSTM(2, return_sequences=True)(b_gr1)
# bn_b_gr1 = BatchNormalization(momentum=0.98)(b_gr1)
# b_gr2 = GRU(64, return_sequences=True)(b_gr1)
# bn_b_gr2 = BatchNormalization(momentum=0.98)(b_gr2)
# d1 = Dense(32)(b_gr2)
# bn_d1 = BatchNormalization(momentum=0.98)(d1)
# bn_da1 = Activation('relu')(bn_d1)
# d2 = Dense(8)(bn_da1)
# bn_d2 = BatchNormalization(momentum=0.98)(d2)
# bn_da2 = Activation('relu')(bn_d2)
# d3 = Dense(1)(b_gr1)
# # bn_d3 = BatchNormalization(momentum=0.98)(d3)
# bn_da3 = Activation('softmax')(d3)
oup = Reshape(target_shape=(input_dim[0],))(b_gr1)
return Model(inp, oup) return Model(inp, oup)
def write_model_arch(mod,mod_file): def write_model_arch(mod,mod_file):
@@ -75,7 +81,7 @@ def load_model_arch(mod_file):
model_f.close() model_f.close()
return mod return mod
def train_segment(collection_name = 'test'): def train_segment(collection_name = 'test',resume_weights='',initial_epoch=0):
# collection_name = 'story_test' # collection_name = 'story_test'
batch_size = 128 batch_size = 128
# batch_size = 4 # batch_size = 4
@@ -101,28 +107,30 @@ def train_segment(collection_name = 'test'):
embeddings_freq=0, embeddings_freq=0,
embeddings_layer_names=None, embeddings_layer_names=None,
embeddings_metadata=None) embeddings_metadata=None)
cp_file_fmt = model_dir+'/siamese_speech_model-{epoch:02d}-epoch-{val_loss:0.2f}\ cp_file_fmt = model_dir+'/speech_segment_model-{epoch:02d}-epoch-{val_loss:0.2f}\
-acc.h5' -acc.h5'
cp_cb = ModelCheckpoint( cp_cb = ModelCheckpoint(
cp_file_fmt, cp_file_fmt,
monitor='val_loss', monitor='val_loss',
verbose=0, verbose=0,
save_best_only=True, save_best_only=False,
save_weights_only=True, save_weights_only=True,
mode='auto', mode='auto',
period=1) period=1)
# train # train
rms = RMSprop() opt = RMSprop()
model.compile(loss=categorical_crossentropy, optimizer=rms, metrics=[accuracy]) model.compile(loss=categorical_crossentropy, optimizer=opt, metrics=[accuracy])
write_model_arch(model,model_dir+'/siamese_speech_model_arch.yaml') write_model_arch(model,model_dir+'/speech_segment_model_arch.yaml')
epoch_n_steps = step_count(n_records,batch_size) epoch_n_steps = step_count(n_records,batch_size)
if resume_weights != '':
model.load_weights(resume_weights)
model.fit_generator(tr_gen model.fit_generator(tr_gen
, epochs=1000 , epochs=10000
, steps_per_epoch=epoch_n_steps , steps_per_epoch=epoch_n_steps
, validation_data=(te_x, te_y) , validation_data=(te_x, te_y)
, max_queue_size=32 , max_queue_size=32
, callbacks=[tb_cb, cp_cb]) , callbacks=[tb_cb, cp_cb],initial_epoch=initial_epoch)
model.save(model_dir+'/speech_segment_model-final.h5') model.save(model_dir+'/speech_segment_model-final.h5')
# y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]]) # y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
@@ -133,4 +141,4 @@ def train_segment(collection_name = 'test'):
if __name__ == '__main__': if __name__ == '__main__':
# pass # pass
train_segment('test') train_segment('story_words')#,'./models/segment/story_phrases.1000/speech_segment_model-final.h5',1001)

View File

@@ -1,27 +1,28 @@
import pandas as pd import pandas as pd
from speech_tools import apply_by_multiprocessing,threadsafe_iter,reservoir_sample,padd_zeros from speech_tools import *
from speech_pitch import *
# import dask as dd # import dask as dd
# import dask.dataframe as ddf # import dask.dataframe as ddf
import tensorflow as tf import tensorflow as tf
from tensorflow.python.ops import data_flow_ops from tensorflow.python.ops import data_flow_ops
import numpy as np import numpy as np
from speech_spectrum import generate_aiff_spectrogram from speech_spectrum import generate_aiff_spectrogram,generate_sample_spectrogram
from speech_pitch import pitch_array from speech_similar import segmentable_phoneme
from speech_pitch import compute_mfcc
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
import os,shutil import os,shutil
import random import random
import csv import csv
import gc import gc
import pickle import pickle
import itertools
from tqdm import tqdm from tqdm import tqdm
def siamese_pairs(rightGroup, wrongGroup): def siamese_pairs(rightGroup, wrongGroup):
group1 = [r for (i, r) in rightGroup.iterrows()] group1 = [r for (i, r) in rightGroup.iterrows()]
group2 = [r for (i, r) in wrongGroup.iterrows()] group2 = [r for (i, r) in wrongGroup.iterrows()]
rightWrongPairs = [(g1, g2) for g2 in group2 for g1 in group1]+[(g2, g1) for g2 in group2 for g1 in group1] rightWrongPairs = [(g1, g2) for g2 in group2 for g1 in group1]#+[(g2, g1) for g2 in group2 for g1 in group1]
rightRightPairs = [i for i in itertools.permutations(group1, 2)]#+[i for i in itertools.combinations(group2, 2)] rightRightPairs = [i for i in itertools.combinations(group1, 2)]#+[i for i in itertools.combinations(group2, 2)]
def filter_criteria(s1,s2): def filter_criteria(s1,s2):
same = s1['variant'] == s2['variant'] same = s1['variant'] == s2['variant']
phon_same = s1['phonemes'] == s2['phonemes'] phon_same = s1['phonemes'] == s2['phonemes']
@@ -38,6 +39,58 @@ def siamese_pairs(rightGroup, wrongGroup):
# return rightRightPairs[:10],rightWrongPairs[:10] # return rightRightPairs[:10],rightWrongPairs[:10]
return validRRPairs[:32],validRWPairs[:32] return validRRPairs[:32],validRWPairs[:32]
def seg_siamese_pairs(rightGroup, wrongGroup):
group1 = [r for (i, r) in rightGroup.iterrows()]
group2 = [r for (i, r) in wrongGroup.iterrows()]
rightWrongPairs = [(g1, g2) for g2 in group2 for g1 in group1]#+[(g2, g1) for g2 in group2 for g1 in group1]
rightRightPairs = [i for i in itertools.combinations(group1, 2)]#+[i for i in itertools.combinations(group2, 2)]
def filter_criteria(s1,s2):
same = s1['variant'] == s2['variant']
phon_same = s1['phonemes'] == s2['phonemes']
voice_diff = s1['voice'] != s2['voice']
if not same and phon_same:
return False
# if same and not voice_diff:
# return False
return True
validRWPairs = [i for i in rightWrongPairs if filter_criteria(*i)]
validRRPairs = [i for i in rightRightPairs if filter_criteria(*i)]
random.shuffle(validRWPairs)
random.shuffle(validRRPairs)
rrPhonePairs = []
rwPhonePairs = []
def compute_seg_spec(s1,s2):
phon_count = len(s1['parsed_phoneme'])
seg1_count = len(s1['segments'].index)
seg2_count = len(s2['segments'].index)
if phon_count == seg1_count and seg2_count == phon_count:
s1nd,s2nd = pm_snd(s1['file_path']),pm_snd(s2['file_path'])
segs1 = [tuple(x) for x in s1['segments'][['start','end']].values]
segs2 = [tuple(x) for x in s2['segments'][['start','end']].values]
s1_cp = pd.Series(s1)
s2_cp = pd.Series(s2)
pp12 = zip(s1['parsed_phoneme'],s2['parsed_phoneme'],segs1,segs2)
for (p1,p2,(s1s,s1e),(s2s,s2e)) in pp12:
spc1 = generate_sample_spectrogram(s1nd.extract_part(s1s,s1e).values)
spc2 = generate_sample_spectrogram(s2nd.extract_part(s2s,s2e).values)
s1_cp['spectrogram'] = spc1
s2_cp['spectrogram'] = spc2
# import pdb; pdb.set_trace()
if repr(p1) == repr(p2):
rrPhonePairs.append((s1_cp,s2_cp))
else:
rwPhonePairs.append((s1_cp,s2_cp))
for (s1,s2) in validRRPairs:
compute_seg_spec(s1,s2)
for (s1,s2) in validRWPairs:
compute_seg_spec(s1,s2)
return rrPhonePairs[:32],rwPhonePairs[:32]
# return rightRightPairs[:10],rightWrongPairs[:10]
# return
# validRRPairs[:8],validRWPairs[:8]
def _float_feature(value): def _float_feature(value):
return tf.train.Feature(float_list=tf.train.FloatList(value=value)) return tf.train.Feature(float_list=tf.train.FloatList(value=value))
@@ -64,8 +117,8 @@ def create_spectrogram_tfrecords(audio_group='audio',sample_count=0,train_test_r
for (w, word_group) in word_group_prog: for (w, word_group) in word_group_prog:
word_group_prog.set_postfix(word=w,sample_name=sample_name) word_group_prog.set_postfix(word=w,sample_name=sample_name)
g = word_group.reset_index() g = word_group.reset_index()
g['spectrogram'] = apply_by_multiprocessing(g['file_path'],pitch_array) # g['spectrogram'] = apply_by_multiprocessing(g['file_path'],pitch_array)
# g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram) g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram)
# g['spectrogram'] = apply_by_multiprocessing(g['file_path'],compute_mfcc) # g['spectrogram'] = apply_by_multiprocessing(g['file_path'],compute_mfcc)
sample_right = g.loc[g['variant'] == 'low'] sample_right = g.loc[g['variant'] == 'low']
sample_wrong = g.loc[g['variant'] == 'medium'] sample_wrong = g.loc[g['variant'] == 'medium']
@@ -226,6 +279,94 @@ def convert_old_audio():
audio_samples = audio_samples[['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file']] audio_samples = audio_samples[['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file']]
audio_samples.to_csv('./outputs/audio_new.csv',index=False,header=False) audio_samples.to_csv('./outputs/audio_new.csv',index=False,header=False)
def generate_sppas_trans(audio_group='story_words.all'):
# audio_group='story_words.all'
audio_samples = pd.read_csv( './outputs/' + audio_group + '.fixed.csv',index_col=0)
audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x)
# audio_samples = audio_samples.head(5)
rows = tqdm(audio_samples.iterrows(),total = len(audio_samples.index)
, desc='Transcribing Words ')
for (i,row) in rows:
# len(audio_samples.iterrows())
# (i,row) = next(audio_samples.iterrows())
rows.set_postfix(word=row['word'])
transribe_audio_text(row['file_path'],row['word'])
rows.close()
def create_seg_phonpair_tfrecords(audio_group='story_words.all',sample_count=0,train_test_ratio=0.1):
audio_samples = pd.read_csv( './outputs/' + audio_group + '.fixed.csv',index_col=0)
audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x)
audio_samples = audio_samples[(audio_samples['variant'] == 'low') | (audio_samples['variant'] == 'medium')]
audio_samples['parsed_phoneme'] = apply_by_multiprocessing(audio_samples['phonemes'],segmentable_phoneme)
# audio_samples['sound'] = apply_by_multiprocessing(audio_samples['file_path'],pm_snd)
# read_seg_file(audio_samples.iloc[0]['file_path'])
audio_samples['segments'] = apply_by_multiprocessing(audio_samples['file_path'],read_seg_file)
n_records,n_spec,n_features = 0,0,0
def write_samples(wg,sample_name):
word_group_prog = tqdm(wg,desc='Computing PhonPair spectrogram')
record_file = './outputs/{}.{}.tfrecords'.format(audio_group,sample_name)
writer = tf.python_io.TFRecordWriter(record_file)
for (w, word_group) in word_group_prog:
word_group_prog.set_postfix(word=w,sample_name=sample_name)
g = word_group.reset_index()
# g['spectrogram'] = apply_by_multiprocessing(g['file_path'],pitch_array)
# g['spectrogram'] = apply_by_multiprocessing(g['file_path'],generate_aiff_spectrogram)
# g['spectrogram'] = apply_by_multiprocessing(g['file_path'],compute_mfcc)
sample_right = g.loc[g['variant'] == 'low']
sample_wrong = g.loc[g['variant'] == 'medium']
same, diff = seg_siamese_pairs(sample_right, sample_wrong)
groups = [([0,1],same),([1,0],diff)]
for (output,group) in groups:
group_prog = tqdm(group,desc='Writing Spectrogram')
for sample1,sample2 in group_prog:
group_prog.set_postfix(output=output
,var1=sample1['variant']
,var2=sample2['variant'])
spectro1,spectro2 = sample1['spectrogram'],sample2['spectrogram']
spec_n1,spec_n2 = spectro1.shape[0],spectro2.shape[0]
spec_w1,spec_w2 = spectro1.shape[1],spectro2.shape[1]
spec1,spec2 = spectro1.reshape(-1),spectro2.reshape(-1)
nonlocal n_spec,n_records,n_features
n_spec = max([n_spec,spec_n1,spec_n2])
n_features = spec_w1
n_records+=1
example = tf.train.Example(features=tf.train.Features(
feature={
'word': _bytes_feature([w.encode('utf-8')]),
'phoneme1': _bytes_feature([sample1['phonemes'].encode('utf-8')]),
'phoneme2': _bytes_feature([sample2['phonemes'].encode('utf-8')]),
'voice1': _bytes_feature([sample1['voice'].encode('utf-8')]),
'voice2': _bytes_feature([sample2['voice'].encode('utf-8')]),
'language': _bytes_feature([sample1['language'].encode('utf-8')]),
'rate1':_int64_feature([sample1['rate']]),
'rate2':_int64_feature([sample2['rate']]),
'variant1': _bytes_feature([sample1['variant'].encode('utf-8')]),
'variant2': _bytes_feature([sample2['variant'].encode('utf-8')]),
'file1': _bytes_feature([sample1['file'].encode('utf-8')]),
'file2': _bytes_feature([sample2['file'].encode('utf-8')]),
'spec1':_float_feature(spec1),
'spec2':_float_feature(spec2),
'spec_n1':_int64_feature([spec_n1]),
'spec_w1':_int64_feature([spec_w1]),
'spec_n2':_int64_feature([spec_n2]),
'spec_w2':_int64_feature([spec_w2]),
'output':_int64_feature(output)
}
))
writer.write(example.SerializeToString())
group_prog.close()
word_group_prog.close()
writer.close()
word_groups = [i for i in audio_samples.groupby('word')]
wg_sampled = reservoir_sample(word_groups,sample_count) if sample_count > 0 else word_groups
tr_audio_samples,te_audio_samples = train_test_split(wg_sampled,test_size=train_test_ratio)
write_samples(tr_audio_samples,'train')
write_samples(te_audio_samples,'test')
const_file = os.path.join('./outputs',audio_group+'.constants')
pickle.dump((n_spec,n_features,n_records),open(const_file,'wb'))
if __name__ == '__main__': if __name__ == '__main__':
# sunflower_pairs_data() # sunflower_pairs_data()
# create_spectrogram_data() # create_spectrogram_data()
@@ -240,8 +381,10 @@ if __name__ == '__main__':
# create_spectrogram_tfrecords('audio',sample_count=100) # create_spectrogram_tfrecords('audio',sample_count=100)
# create_spectrogram_tfrecords('story_all',sample_count=25) # create_spectrogram_tfrecords('story_all',sample_count=25)
# fix_csv('story_words_test') # fix_csv('story_words_test')
#fix_csv('audio') # fix_csv('test_5_words')
create_spectrogram_tfrecords('story_words_pitch',sample_count=0,train_test_ratio=0.1) # generate_sppas_trans('test_5_words')
create_seg_phonpair_tfrecords('test_5_words')
# create_spectrogram_tfrecords('story_words.all',sample_count=0,train_test_ratio=0.1)
#record_generator_count() #record_generator_count()
# create_spectrogram_tfrecords('audio',sample_count=50) # create_spectrogram_tfrecords('audio',sample_count=50)
# read_siamese_tfrecords_generator('audio') # read_siamese_tfrecords_generator('audio')

View File

@@ -103,7 +103,7 @@ def train_siamese(audio_group = 'audio',resume_weights='',initial_epoch=0):
cp_cb = ModelCheckpoint( cp_cb = ModelCheckpoint(
cp_file_fmt, cp_file_fmt,
monitor='val_loss', monitor='acc',
verbose=0, verbose=0,
save_best_only=True, save_best_only=True,
save_weights_only=True, save_weights_only=True,
@@ -117,7 +117,7 @@ def train_siamese(audio_group = 'audio',resume_weights='',initial_epoch=0):
if resume_weights != '': if resume_weights != '':
model.load_weights(resume_weights) model.load_weights(resume_weights)
model.fit_generator(tr_gen model.fit_generator(tr_gen
, epochs=1000 , epochs=10000
, steps_per_epoch=epoch_n_steps , steps_per_epoch=epoch_n_steps
, validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y) , validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y)
, max_queue_size=8 , max_queue_size=8
@@ -131,5 +131,4 @@ def train_siamese(audio_group = 'audio',resume_weights='',initial_epoch=0):
if __name__ == '__main__': if __name__ == '__main__':
train_siamese('story_words_pitch') train_siamese('test_5_words')

View File

@@ -30,7 +30,7 @@ def compute_mfcc(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.a
return sample_mfcc.to_array() return sample_mfcc.to_array()
def compute_formants(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'): def compute_formants(sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'):
sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff' # sample_file='outputs/audio/sunflowers-Victoria-180-normal-870.aiff'
sample_sound = pm_snd(sample_file) sample_sound = pm_snd(sample_file)
sample_formant = sample_sound.to_formant_burg() sample_formant = sample_sound.to_formant_burg()
# sample_formant.x_bins() # sample_formant.x_bins()

View File

@@ -12,9 +12,9 @@ import time
from tqdm import tqdm from tqdm import tqdm
from generate_similar import similar_phoneme_phrase,similar_phrase from generate_similar import similar_phoneme_phrase,similar_phrase
from speech_tools import hms_string,create_dir,format_filename from speech_tools import hms_string,create_dir,format_filename,reservoir_sample
OUTPUT_NAME = 'story_phrases' OUTPUT_NAME = 'test_5_words'
dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/' dest_dir = os.path.abspath('.') + '/outputs/' + OUTPUT_NAME + '/'
dest_file = './outputs/' + OUTPUT_NAME + '.csv' dest_file = './outputs/' + OUTPUT_NAME + '.csv'
@@ -216,6 +216,9 @@ def generate_audio_for_text_list(text_list):
closer() closer()
def generate_audio_for_stories(): def generate_audio_for_stories():
'''
Generates the audio sample variants for the list of words in the stories
'''
# story_file = './inputs/all_stories_hs.json' # story_file = './inputs/all_stories_hs.json'
story_file = './inputs/all_stories.json' story_file = './inputs/all_stories.json'
stories_data = json.load(open(story_file)) stories_data = json.load(open(story_file))
@@ -224,7 +227,11 @@ def generate_audio_for_stories():
text_list = sorted(list(set(text_list_dup))) text_list = sorted(list(set(text_list_dup)))
generate_audio_for_text_list(text_list) generate_audio_for_text_list(text_list)
def generate_test_audio_for_stories(): def generate_test_audio_for_stories(sample_count=0):
'''
Picks a list of words from the wordlist that are not in story words
and generates the variants
'''
story_file = './inputs/all_stories_hs.json' story_file = './inputs/all_stories_hs.json'
# story_file = './inputs/all_stories.json' # story_file = './inputs/all_stories.json'
stories_data = json.load(open(story_file)) stories_data = json.load(open(story_file))
@@ -234,11 +241,12 @@ def generate_test_audio_for_stories():
word_list = [i.strip('\n_') for i in open('./inputs/wordlist.txt','r').readlines()] word_list = [i.strip('\n_') for i in open('./inputs/wordlist.txt','r').readlines()]
text_set = set(text_list) text_set = set(text_list)
new_word_list = [i for i in word_list if i not in text_set and len(i) > 4] new_word_list = [i for i in word_list if i not in text_set and len(i) > 4]
test_words = new_word_list[:int(len(text_list)/5+1)] # test_words = new_word_list[:int(len(text_list)/5+1)]
test_words = reservoir_sample(new_word_list,sample_count) if sample_count > 0 else new_word_list
generate_audio_for_text_list(test_words) generate_audio_for_text_list(test_words)
if __name__ == '__main__': if __name__ == '__main__':
# generate_test_audio_for_stories() generate_test_audio_for_stories(5)
# generate_audio_for_text_list(['I want to go home','education']) # generate_audio_for_text_list(['I want to go home','education'])
generate_audio_for_stories() # generate_audio_for_stories()

View File

@@ -57,7 +57,8 @@ class Delegate (NSObject):
'''Called automatically when the application has launched''' '''Called automatically when the application has launched'''
print("App Launched!") print("App Launched!")
# phrases = story_texts()#random.sample(story_texts(), 100) # # phrases = story_texts()#random.sample(story_texts(), 100) #
phrases = test_texts(30) # phrases = test_texts(30)
phrases = story_words()
# print(phrases) # print(phrases)
generate_audio(phrases) generate_audio(phrases)
@@ -174,14 +175,19 @@ class SynthesizerQueue(object):
def story_texts(): def story_texts():
# story_file = './inputs/all_stories_hs.json'
story_file = './inputs/all_stories.json' story_file = './inputs/all_stories.json'
stories_data = json.load(open(story_file)) stories_data = json.load(open(story_file))
# text_list_dup = [t[0] for i in stories_data.values() for t in i]
text_list_dup = [t for i in stories_data.values() for t in i] text_list_dup = [t for i in stories_data.values() for t in i]
text_list = sorted(list(set(text_list_dup))) text_list = sorted(list(set(text_list_dup)))
return text_list return text_list
def story_words():
story_file = './inputs/all_stories_hs.json'
stories_data = json.load(open(story_file))
text_list_dup = [t[0] for i in stories_data.values() for t in i]
text_list = sorted(list(set(text_list_dup)))
return text_list
def test_texts(count=10): def test_texts(count=10):
word_list = [i.strip('\n_') for i in open('./inputs/wordlist.txt','r').readlines()] word_list = [i.strip('\n_') for i in open('./inputs/wordlist.txt','r').readlines()]
text_list = sorted(random.sample(list(set(word_list)),count)) text_list = sorted(random.sample(list(set(word_list)),count))

View File

@@ -120,9 +120,11 @@ def parse_apple_phonemes(ph_str):
elif pref[0].isdigit() and pref[1:] in apple_phonemes: elif pref[0].isdigit() and pref[1:] in apple_phonemes:
return [ApplePhoneme(pref[1:], int(pref[0]) , True)] + parse_apple_phonemes(rest) return [ApplePhoneme(pref[1:], int(pref[0]) , True)] + parse_apple_phonemes(rest)
elif not pref.isalnum(): elif not pref.isalnum():
return [ApplePhoneme(pref, 0, False)] + parse_apple_phonemes(rest) return [ApplePhoneme(pref, -1, False)] + parse_apple_phonemes(rest)
return [] return []
def segmentable_phoneme(ph_str):
return [p for p in parse_apple_phonemes(ph_str) if p.stress >=0]
def similar_phoneme_word(ph_str): def similar_phoneme_word(ph_str):
phons = parse_apple_phonemes(ph_str) phons = parse_apple_phonemes(ph_str)

View File

@@ -79,6 +79,9 @@ def generate_spec_frec(samples, samplerate):
ims[ims < 0] = 0 #np.finfo(sshow.dtype).eps ims[ims < 0] = 0 #np.finfo(sshow.dtype).eps
return ims, freq return ims, freq
def generate_sample_spectrogram(samples):
ims, _ = generate_spec_frec(samples, 22050)
return ims
def generate_aiff_spectrogram(audiopath): def generate_aiff_spectrogram(audiopath):
samples, samplerate, _ = snd.read(audiopath) samples, samplerate, _ = snd.read(audiopath)

View File

@@ -1,5 +1,5 @@
from speech_model import load_model_arch from speech_model import load_model_arch
from speech_tools import record_spectrogram, file_player from speech_tools import record_spectrogram, file_player, padd_zeros, pair_for_word
from speech_data import record_generator_count from speech_data import record_generator_count
# from importlib import reload # from importlib import reload
# import speech_data # import speech_data
@@ -20,6 +20,21 @@ def predict_recording_with(m,sample_size=15):
inp = create_test_pair(spec1,spec2,sample_size) inp = create_test_pair(spec1,spec2,sample_size)
return m.predict([inp[:, 0], inp[:, 1]]) return m.predict([inp[:, 0], inp[:, 1]])
def predict_tts_sample(sample_word = 'able',audio_group='story_words',weights = 'siamese_speech_model-153-epoch-0.55-acc.h5'):
# sample_word = 'able';audio_group='story_words';weights = 'siamese_speech_model-153-epoch-0.55-acc.h5'
const_file = './models/'+audio_group+'/constants.pkl'
arch_file='./models/'+audio_group+'/siamese_speech_model_arch.yaml'
weight_file='./models/'+audio_group+'/'+weights
(sample_size,n_features,n_records) = pickle.load(open(const_file,'rb'))
model = load_model_arch(arch_file)
model.load_weights(weight_file)
spec1,spec2 = pair_for_word(sample_word)
p_spec1 = padd_zeros(spec1,sample_size)
p_spec2 = padd_zeros(spec2,sample_size)
inp = np.array([[p_spec1,p_spec2]])
result = model.predict([inp[:, 0], inp[:, 1]])[0]
res_str = 'same' if result[0] < result[1] else 'diff'
return res_str
def test_with(audio_group): def test_with(audio_group):
X,Y = speech_data(audio_group) X,Y = speech_data(audio_group)
@@ -177,7 +192,7 @@ def visualize_results(audio_group='audio'):
if __name__ == '__main__': if __name__ == '__main__':
# evaluate_siamese('./outputs/story_words_test.train.tfrecords',audio_group='story_words.gpu',weights ='siamese_speech_model-58-epoch-0.00-acc.h5') # evaluate_siamese('./outputs/story_words_test.train.tfrecords',audio_group='story_words.gpu',weights ='siamese_speech_model-58-epoch-0.00-acc.h5')
# evaluate_siamese('./outputs/story_words.test.tfrecords',audio_group='story_words',weights ='siamese_speech_model-675-epoch-0.00-acc.h5') # evaluate_siamese('./outputs/story_words.test.tfrecords',audio_group='story_words',weights ='siamese_speech_model-675-epoch-0.00-acc.h5')
evaluate_siamese('./outputs/story_words_pitch.test.tfrecords',audio_group='story_words_pitch',weights ='siamese_speech_model-867-epoch-0.12-acc.h5') evaluate_siamese('./outputs/story_words.test.tfrecords',audio_group='story_words',weights ='siamese_speech_model-153-epoch-0.55-acc.h5')
# play_results('story_words') # play_results('story_words')
#inspect_tfrecord('./outputs/story_phrases.test.tfrecords',audio_group='story_phrases') #inspect_tfrecord('./outputs/story_phrases.test.tfrecords',audio_group='story_phrases')
# visualize_results('story_words.gpu') # visualize_results('story_words.gpu')

50
speech_testgen.py Normal file
View File

@@ -0,0 +1,50 @@
import voicerss_tts
import json
from speech_tools import format_filename
def generate_voice(phrase):
voice = voicerss_tts.speech({
'key': '0ae89d82aa78460691c99a4ac8c0f9ec',
'hl': 'en-us',
'src': phrase,
'r': '0',
'c': 'mp3',
'f': '22khz_16bit_mono',
'ssml': 'false',
'b64': 'false'
})
if not voice['error']:
return voice[b'response']
return None
def generate_test_audio_for_stories():
story_file = './inputs/all_stories_hs.json'
# story_file = './inputs/all_stories.json'
stories_data = json.load(open(story_file))
text_list_dup = [t[0] for i in stories_data.values() for t in i]
text_list = sorted(list(set(text_list_dup)))[:10]
for t in text_list:
v = generate_voice(t)
if v:
f_name = format_filename(t)
tf = open('inputs/voicerss/'+f_name+'.mp3','wb')
tf.write(v)
tf.close()
# def generate_test_audio_for(records_file,audio_group='audio'):
# # audio_group='audio';model_file = 'siamese_speech_model-305-epoch-0.20-acc.h5'
# # records_file = os.path.join('./outputs',eval_group+'.train.tfrecords')
# const_file = os.path.join('./models/'+audio_group+'/','constants.pkl')
# (n_spec,n_features,n_records) = pickle.load(open(const_file,'rb'))
# print('evaluating {}...'.format(records_file))
# record_iterator,records_count = record_generator_count(records_file)
# all_results = []
# for (i,string_record) in tqdm(enumerate(record_iterator),total=records_count):
# total+=1
# example = tf.train.Example()
# example.ParseFromString(string_record)
# word = example.features.feature['word'].bytes_list.value[0].decode()
# audio = generate_voice('hello world')
# audio

View File

@@ -5,16 +5,19 @@ import threading
import itertools import itertools
import random import random
import multiprocessing import multiprocessing
import subprocess
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import pyaudio import pyaudio
from pysndfile import sndio as snd from pysndfile import sndio as snd
# from matplotlib import pyplot as plt # from matplotlib import pyplot as plt
from speech_spectrum import plot_stft, generate_spec_frec from speech_spectrum import plot_stft, generate_spec_frec,generate_aiff_spectrogram
SAMPLE_RATE = 22050 SAMPLE_RATE = 22050
N_CHANNELS = 2 N_CHANNELS = 2
devnull = open(os.devnull, 'w')
def step_count(n_records,batch_size): def step_count(n_records,batch_size):
return int(math.ceil(n_records*1.0/batch_size)) return int(math.ceil(n_records*1.0/batch_size))
@@ -56,6 +59,13 @@ def padd_zeros(spgr, max_samples):
return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)], return np.lib.pad(spgr, [(0, max_samples - spgr.shape[0]), (0, 0)],
'constant') 'constant')
def read_seg_file(aiff_name):
base_name = aiff_name.rsplit('.aiff',1)[0]
seg_file = base_name+'-palign.csv'
seg_data = pd.read_csv(seg_file,names=['action','start','end','phoneme'])
seg_data = seg_data[(seg_data['action'] == 'PhonAlign') & (seg_data['phoneme'] != '#')]
return seg_data
def record_spectrogram(n_sec, plot=False, playback=False): def record_spectrogram(n_sec, plot=False, playback=False):
# show_record_prompt() # show_record_prompt()
N_SEC = n_sec N_SEC = n_sec
@@ -91,6 +101,20 @@ def record_spectrogram(n_sec, plot=False, playback=False):
ims, _ = generate_spec_frec(one_channel, SAMPLE_RATE) ims, _ = generate_spec_frec(one_channel, SAMPLE_RATE)
return ims return ims
def pair_for_word(phrase='able'):
spec1 = generate_aiff_spectrogram('./inputs/pairs/good/'+phrase+'.aiff')
spec2 = generate_aiff_spectrogram('./inputs/pairs/test/'+phrase+'.aiff')
return spec1,spec2
def transribe_audio_text(aiff_name,phrase):
base_name = aiff_name.rsplit('.aiff',1)[0]
wav_name = base_name+'.wav'
txt_name = base_name+'.txt'
params = ['ffmpeg', '-y', '-i',aiff_name,wav_name]
subprocess.call(params,stdout=devnull,stderr=devnull)
trcr_f = open(txt_name,'w')
trcr_f.write(phrase)
trcr_f.close()
def _apply_df(args): def _apply_df(args):
df, func, num, kwargs = args df, func, num, kwargs = args

52
voicerss_tts.py Normal file
View File

@@ -0,0 +1,52 @@
import http.client, urllib.request, urllib.parse, urllib.error
def speech(settings):
__validate(settings)
return __request(settings)
def __validate(settings):
if not settings: raise RuntimeError('The settings are undefined')
if 'key' not in settings or not settings['key']: raise RuntimeError('The API key is undefined')
if 'src' not in settings or not settings['src']: raise RuntimeError('The text is undefined')
if 'hl' not in settings or not settings['hl']: raise RuntimeError('The language is undefined')
def __request(settings):
result = {'error': None, 'response': None}
headers = {'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'}
params = urllib.parse.urlencode(__buildRequest(settings))
if 'ssl' in settings and settings['ssl']:
conn = http.client.HTTPSConnection('api.voicerss.org:443')
else:
conn = http.client.HTTPConnection('api.voicerss.org:80')
conn.request('POST', '/', params, headers)
response = conn.getresponse()
content = response.read()
if response.status != 200:
result[b'error'] = response.reason
elif content.find(b'ERROR') == 0:
result[b'error'] = content
else:
result[b'response'] = content
conn.close()
return result
def __buildRequest(settings):
params = {'key': '', 'src': '', 'hl': '', 'r': '', 'c': '', 'f': '', 'ssml': '', 'b64': ''}
if 'key' in settings: params['key'] = settings['key']
if 'src' in settings: params['src'] = settings['src']
if 'hl' in settings: params['hl'] = settings['hl']
if 'r' in settings: params['r'] = settings['r']
if 'c' in settings: params['c'] = settings['c']
if 'f' in settings: params['f'] = settings['f']
if 'ssml' in settings: params['ssml'] = settings['ssml']
if 'b64' in settings: params['b64'] = settings['b64']
return params

52
voicerss_tts.py.bak Normal file
View File

@@ -0,0 +1,52 @@
import httplib, urllib
def speech(settings):
__validate(settings)
return __request(settings)
def __validate(settings):
if not settings: raise RuntimeError('The settings are undefined')
if 'key' not in settings or not settings['key']: raise RuntimeError('The API key is undefined')
if 'src' not in settings or not settings['src']: raise RuntimeError('The text is undefined')
if 'hl' not in settings or not settings['hl']: raise RuntimeError('The language is undefined')
def __request(settings):
result = {'error': None, 'response': None}
headers = {'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'}
params = urllib.urlencode(__buildRequest(settings))
if 'ssl' in settings and settings['ssl']:
conn = httplib.HTTPSConnection('api.voicerss.org:443')
else:
conn = httplib.HTTPConnection('api.voicerss.org:80')
conn.request('POST', '/', params, headers)
response = conn.getresponse()
content = response.read()
if response.status != 200:
result['error'] = response.reason
elif content.find('ERROR') == 0:
result['error'] = content
else:
result['response'] = content
conn.close()
return result
def __buildRequest(settings):
params = {'key': '', 'src': '', 'hl': '', 'r': '', 'c': '', 'f': '', 'ssml': '', 'b64': ''}
if 'key' in settings: params['key'] = settings['key']
if 'src' in settings: params['src'] = settings['src']
if 'hl' in settings: params['hl'] = settings['hl']
if 'r' in settings: params['r'] = settings['r']
if 'c' in settings: params['c'] = settings['c']
if 'f' in settings: params['f'] = settings['f']
if 'ssml' in settings: params['ssml'] = settings['ssml']
if 'b64' in settings: params['b64'] = settings['b64']
return params