Respect the verbose flag: print skip diagnostics only when verbose is set

Malar Kannan 2020-05-27 15:54:16 +05:30
parent 6f395af10d
commit 41af0a87de
2 changed files with 14 additions and 7 deletions

View File

@@ -59,10 +59,12 @@ def extract_data(
code_wav = code_fb.getvalue() code_wav = code_fb.getvalue()
# only starting 1 min audio has reliable alignment ignore rest # only starting 1 min audio has reliable alignment ignore rest
if start_time > 60: if start_time > 60:
if verbose:
print(f'start time over 60 seconds of audio skipping.') print(f'start time over 60 seconds of audio skipping.')
break break
# only if some reasonable audio data is present yield it # only if some reasonable audio data is present yield it
if code_seg.duration_seconds < 0.5: if code_seg.duration_seconds < 0.5:
if verbose:
print(f'transcript chunk "{transcript}" contains no audio skipping.') print(f'transcript chunk "{transcript}" contains no audio skipping.')
continue continue
yield transcript, code_seg.duration_seconds, code_wav yield transcript, code_seg.duration_seconds, code_wav

View File

@@ -63,6 +63,7 @@ def extract_data(
# print(monologue["speaker_name"]) # print(monologue["speaker_name"])
speaker_channel = channel_map.get(monologue["speaker_name"]) speaker_channel = channel_map.get(monologue["speaker_name"])
if not speaker_channel: if not speaker_channel:
if verbose:
print(f'unknown speaker tag {monologue["speaker_name"]} in wav:{wav_path} skipping.') print(f'unknown speaker tag {monologue["speaker_name"]} in wav:{wav_path} skipping.')
continue continue
try: try:
@@ -79,6 +80,7 @@ def extract_data(
.collect()(monologue)[-1] .collect()(monologue)[-1]
) )
except IndexError: except IndexError:
if verbose:
print(f'error when loading timestamp events in wav:{wav_path} skipping.') print(f'error when loading timestamp events in wav:{wav_path} skipping.')
continue continue
@@ -92,6 +94,7 @@ def extract_data(
text_clean = re.sub(r"\[.*\]", "", text) text_clean = re.sub(r"\[.*\]", "", text)
# only if some reasonable audio data is present yield it # only if some reasonable audio data is present yield it
if tscript_wav_seg.duration_seconds < 0.5: if tscript_wav_seg.duration_seconds < 0.5:
if verbose:
print(f'transcript chunk "{text_clean}" contains no audio in {wav_path} skipping.') print(f'transcript chunk "{text_clean}" contains no audio in {wav_path} skipping.')
continue continue
yield text_clean, tscript_wav_seg.duration_seconds, tscript_wav yield text_clean, tscript_wav_seg.duration_seconds, tscript_wav
@@ -113,6 +116,7 @@ def extract_data(
.collect()(monologue)[-1] .collect()(monologue)[-1]
) )
except IndexError: except IndexError:
if verbose:
print(f'error when loading timestamp events in wav:{wav_path} skipping.') print(f'error when loading timestamp events in wav:{wav_path} skipping.')
continue continue
@@ -125,6 +129,7 @@ def extract_data(
text = "".join(lens["elements"].Each()["value"].collect()(monologue)) text = "".join(lens["elements"].Each()["value"].collect()(monologue))
text_clean = re.sub(r"\[.*\]", "", text) text_clean = re.sub(r"\[.*\]", "", text)
if tscript_wav_seg.duration_seconds < 0.5: if tscript_wav_seg.duration_seconds < 0.5:
if verbose:
print(f'transcript chunk "{text_clean}" contains no audio in {wav_path} skipping.') print(f'transcript chunk "{text_clean}" contains no audio in {wav_path} skipping.')
continue continue
yield text_clean, tscript_wav_seg.duration_seconds, tscript_wav yield text_clean, tscript_wav_seg.duration_seconds, tscript_wav