diff --git a/jasper/data/asr_recycler.py b/jasper/data/asr_recycler.py index cc38f8b..57e0bc6 100644 --- a/jasper/data/asr_recycler.py +++ b/jasper/data/asr_recycler.py @@ -59,10 +59,13 @@ def extract_data( code_wav = code_fb.getvalue() # only starting 1 min audio has reliable alignment ignore rest if start_time > 60: + print(f'start time over 60 seconds of audio skipping.') break # only if some reasonable audio data is present yield it - if code_seg.duration_seconds >= 0.5: - yield transcript, code_seg.duration_seconds, code_wav + if code_seg.duration_seconds < 0.5: + print(f'transcript chunk "{transcript}" contains no audio skipping.') + continue + yield transcript, code_seg.duration_seconds, code_wav def asr_data_generator(call_wav, call_wav_fname, events): call_wav_0, call_wav_1 = call_wav.split_to_mono() diff --git a/jasper/data/rev_recycler.py b/jasper/data/rev_recycler.py index 9e45bbd..c83204a 100644 --- a/jasper/data/rev_recycler.py +++ b/jasper/data/rev_recycler.py @@ -91,8 +91,10 @@ def extract_data( text = "".join(lens["elements"].Each()["value"].collect()(monologue)) text_clean = re.sub(r"\[.*\]", "", text) # only if some reasonable audio data is present yield it - if tscript_wav_seg.duration_seconds >= 0.5: - yield text_clean, tscript_wav_seg.duration_seconds, tscript_wav + if tscript_wav_seg.duration_seconds < 0.5: + print(f'transcript chunk "{text_clean}" contains no audio in {wav_path} skipping.') + continue + yield text_clean, tscript_wav_seg.duration_seconds, tscript_wav def mono_asr_data_generator(wav_seg, wav_path, meta): monologues = lens["monologues"].Each().collect()(meta) @@ -122,6 +124,9 @@ def extract_data( tscript_wav = tscript_wav_fb.getvalue() text = "".join(lens["elements"].Each()["value"].collect()(monologue)) text_clean = re.sub(r"\[.*\]", "", text) + if tscript_wav_seg.duration_seconds < 0.5: + print(f'transcript chunk "{text_clean}" contains no audio in {wav_path} skipping.') + continue yield text_clean, tscript_wav_seg.duration_seconds, tscript_wav def generate_rev_asr_data():