dropping invalid csv entries

master
Malar Kannan 2017-11-07 12:43:17 +05:30
parent 55e2de2f04
commit 41b3f1a9fe
1 changed file with 22 additions and 18 deletions

View File

@@ -26,7 +26,7 @@ def siamese_pairs(rightGroup, wrongGroup):
random.shuffle(rightRightPairs)
# return (random.sample(same,10), random.sample(diff,10))
# return rightRightPairs[:10],rightWrongPairs[:10]
return rightRightPairs[:32],rightWrongPairs[:32]
return rightRightPairs[:16],rightWrongPairs[:16]
# return rightRightPairs,rightWrongPairs
def create_spectrogram_tfrecords(audio_group='audio'):
@@ -42,9 +42,9 @@ def create_spectrogram_tfrecords(audio_group='audio'):
audio_samples['file_path'] = audio_samples.loc[:, 'file'].apply(lambda x: 'outputs/' + audio_group + '/' + x)
audio_samples['file_exists'] = apply_by_multiprocessing(audio_samples['file_path'], os.path.exists)
audio_samples = audio_samples[audio_samples['file_exists'] == True].reset_index()
audio_samples['rate_int'] = apply_by_multiprocessing(audio_samples['rate'], str.isdigit)
audio_samples = audio_samples[audio_samples['rate_int'] == True].reset_index().drop(['level_0'],axis=1)
audio_samples['rate'] = audio_samples['rate'].astype(int)
# audio_samples['rate_int'] = apply_by_multiprocessing(audio_samples['rate'], str.isdigit)
# audio_samples = audio_samples[audio_samples['rate_int'] == True].reset_index().drop(['level_0'],axis=1)
# audio_samples['rate'] = audio_samples['rate'].astype(int)
def _float_feature(value):
return tf.train.Feature(float_list=tf.train.FloatList(value=value))
@@ -131,22 +131,26 @@ def audio_samples_word_count(audio_group='audio'):
return len(audio_samples.groupby(audio_samples['word']))
def fix_csv(audio_group='audio'):
audio_group = 'story_all'
audio_samples = pd.read_csv( './outputs/story_words.csv'
, names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file']
, quoting=csv.QUOTE_NONE)
voice_set = set(audio_samples['voice'].unique().tolist())
audio_csv_lines = open('./outputs/' + audio_group + '.csv','r').readlines()
audio_csv_data = [i.strip().split(',') for i in audio_csv_lines]
to_be_fixed = [i for i in audio_csv_data if len(i) > 7]
def unite_words(entries):
entries = to_be_fixed[0]
word_entries = next(((entries[:i],entries[i:]) for (i,e) in enumerate(entries) if e in voice_set),'')
word_entries[1]
return
to_be_fixed[0]
entries = [unite_words for e in to_be_fixed]
[i for i in entries if len(i) % 2 != 0]
# audio_samples = pd.read_csv( './outputs/story_words.csv'
# , names=['word','phonemes', 'voice', 'language', 'rate', 'variant', 'file']
# , quoting=csv.QUOTE_NONE)
# voice_set = set(audio_samples['voice'].unique().tolist())
# to_be_fixed = [i for i in audio_csv_data if len(i) > 7]
# def unite_words(entries):
# entries = to_be_fixed[0]
# word_entries = next(((entries[:i],entries[i:]) for (i,e) in enumerate(entries) if e in voice_set),'')
# word_entries[1]
# return
# to_be_fixed[0]
# entries = [unite_words for e in to_be_fixed]
# [i for i in entries if len(i) % 2 != 0]
proper_rows = [i for i in audio_csv_data if len(i) == 7]
with open('./outputs/' + audio_group + '-new.csv','w') as fixed_csv:
fixed_csv_w = csv.writer(fixed_csv, quoting=csv.QUOTE_MINIMAL)
fixed_csv_w.writerows(proper_rows)
if __name__ == '__main__':
# sunflower_pairs_data()