dropping invalid csv entries
parent
55e2de2f04
commit
41b3f1a9fe
|
|
@ -26,7 +26,7 @@ def siamese_pairs(rightGroup, wrongGroup):
|
|||
random.shuffle(rightRightPairs)
|
||||
# return (random.sample(same,10), random.sample(diff,10))
|
||||
# return rightRightPairs[:10],rightWrongPairs[:10]
|
||||
return rightRightPairs[:32],rightWrongPairs[:32]
|
||||
return rightRightPairs[:16],rightWrongPairs[:16]
|
||||
# return rightRightPairs,rightWrongPairs
|
||||
|
||||
def create_spectrogram_tfrecords(audio_group='audio'):
|
||||
|
|
@ -42,9 +42,9 @@ def create_spectrogram_tfrecords(audio_group='audio'):
|
|||
audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x)
|
||||
audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_path'], os.path.exists)
|
||||
audio_samples = audio_samples[audio_samples['file_exists'] == True].reset_index()
|
||||
audio_samples['rate_int'] = apply_by_multiprocessing(audio_samples['rate'], str.isdigit)
|
||||
audio_samples = audio_samples[audio_samples['rate_int'] == True].reset_index().drop(['level_0'],axis=1)
|
||||
audio_samples['rate'] = audio_samples['rate'].astype(int)
|
||||
# audio_samples['rate_int'] = apply_by_multiprocessing(audio_samples['rate'], str.isdigit)
|
||||
# audio_samples = audio_samples[audio_samples['rate_int'] == True].reset_index().drop(['level_0'],axis=1)
|
||||
# audio_samples['rate'] = audio_samples['rate'].astype(int)
|
||||
def _float_feature(value):
|
||||
return tf.train.Feature(float_list=tf.train.FloatList(value=value))
|
||||
|
||||
|
|
@ -131,22 +131,26 @@ def audio_samples_word_count(audio_group='audio'):
|
|||
return len(audio_samples.groupby(audio_samples['word']))
|
||||
|
||||
def fix_csv(audio_group='audio'):
|
||||
audio_group = 'story_all'
|
||||
audio_samples = pd.read_csv( './outputs/story_words.csv'
|
||||
, names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file']
|
||||
, quoting=csv.QUOTE_NONE)
|
||||
voice_set = set(audio_samples['voice'].unique().tolist())
|
||||
audio_csv_lines = open('./outputs/' + audio_group + '.csv','r').readlines()
|
||||
audio_csv_data = [i.strip().split(',') for i in audio_csv_lines]
|
||||
to_be_fixed = [i for i in audio_csv_data if len(i) > 7]
|
||||
def unite_words(entries):
|
||||
entries = to_be_fixed[0]
|
||||
word_entries = next(((entries[:i],entries[i:]) for (i,e) in enumerate(entries) if e in voice_set),'')
|
||||
word_entries[1]
|
||||
return
|
||||
to_be_fixed[0]
|
||||
entries = [unite_words for e in to_be_fixed]
|
||||
[i for i in entries if len(i) % 2 != 0]
|
||||
# audio_samples = pd.read_csv( './outputs/story_words.csv'
|
||||
# , names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file']
|
||||
# , quoting=csv.QUOTE_NONE)
|
||||
# voice_set = set(audio_samples['voice'].unique().tolist())
|
||||
# to_be_fixed = [i for i in audio_csv_data if len(i) > 7]
|
||||
# def unite_words(entries):
|
||||
# entries = to_be_fixed[0]
|
||||
# word_entries = next(((entries[:i],entries[i:]) for (i,e) in enumerate(entries) if e in voice_set),'')
|
||||
# word_entries[1]
|
||||
# return
|
||||
# to_be_fixed[0]
|
||||
# entries = [unite_words for e in to_be_fixed]
|
||||
# [i for i in entries if len(i) % 2 != 0]
|
||||
proper_rows = [i for i in audio_csv_data if len(i) == 7]
|
||||
with open('./outputs/' + audio_group + '-new.csv','w') as fixed_csv:
|
||||
fixed_csv_w = csv.writer(fixed_csv, quoting=csv.QUOTE_MINIMAL)
|
||||
fixed_csv_w.writerows(proper_rows)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# sunflower_pairs_data()
|
||||
|
|
|
|||
Loading…
Reference in New Issue