diff --git a/jasper/data/rev_recycler.py b/jasper/data/rev_recycler.py
index 6871505..9e45bbd 100644
--- a/jasper/data/rev_recycler.py
+++ b/jasper/data/rev_recycler.py
@@ -49,39 +49,6 @@ def extract_data(
 
         return filter_func
 
-    # def compute_endtime(call_wav, state):
-    #     for (i, st) in enumerate(state):
-    #         start_time = st["AsrResult"]["Alternatives"][0].get("StartTime", 0)
-    #         transcript = st["AsrResult"]["Alternatives"][0]["Transcript"]
-    #         if i + 1 < len(state):
-    #             end_time = state[i + 1]["AsrResult"]["Alternatives"][0]["StartTime"]
-    #         else:
-    #             end_time = call_wav.duration_seconds
-    #         full_code_seg = call_wav[start_time * 1000 : end_time * 1000]
-    #         code_seg = strip_silence(full_code_seg)
-    #         code_fb = BytesIO()
-    #         code_seg.export(code_fb, format="wav")
-    #         code_wav = code_fb.getvalue()
-    #         # only starting 1 min audio has reliable alignment
-    #         if start_time > 60:
-    #             break
-    #         # only of some audio data is present yield it
-    #         if code_seg.duration_seconds >= 0.5:
-    #             yield transcript, code_seg.duration_seconds, code_wav
-
-    # def generate_call_asr_data():
-    #     full_asr_data = []
-    #     total_duration = 0
-    #     for wav, wav_path, ev in wav_event_generator(call_audio_dir):
-    #         asr_data = asr_data_generator(wav, wav_path, ev)
-    #         total_duration += wav.duration_seconds
-    #         full_asr_data.append(asr_data)
-    #     typer.echo(f"loaded {len(full_asr_data)} calls of duration {total_duration}s")
-    #     n_dps = asr_data_writer(call_asr_data, dataset_name, chain(*full_asr_data))
-    #     typer.echo(f"written {n_dps} data points")
-
-    # generate_call_asr_data()
-
     def time_to_msecs(time_str):
         return (
             datetime.datetime.strptime(time_str, "%H:%M:%S,%f")
@@ -123,7 +90,9 @@ def extract_data(
             tscript_wav = tscript_wav_fb.getvalue()
             text = "".join(lens["elements"].Each()["value"].collect()(monologue))
             text_clean = re.sub(r"\[.*\]", "", text)
-            yield text_clean, tscript_wav_seg.duration_seconds, tscript_wav
+            # only yield if some reasonable amount of audio is present
+            if tscript_wav_seg.duration_seconds >= 0.5:
+                yield text_clean, tscript_wav_seg.duration_seconds, tscript_wav
 
     def mono_asr_data_generator(wav_seg, wav_path, meta):
         monologues = lens["monologues"].Each().collect()(meta)
@@ -170,42 +139,6 @@ def extract_data(
     typer.echo(f"written {n_dps} data points")
 
     generate_rev_asr_data()
-    # DEBUG
-    # data = list(wav_event_generator(call_audio_dir))
-    # wav_seg, wav_path, meta = data[0]
-    # left_audio, right_audio = wav_seg.split_to_mono()
-    # channel_map = {"Agent": right_audio, "Client": left_audio}
-    # # data[0][2]['speakers']
-    # # data[0][1]
-    # monologues = lens["monologues"].Each().collect()(meta)
-    # for monologue in monologues:
-    #     # print(monologue["speaker_name"])
-    #     speaker_channel = channel_map.get(monologue["speaker_name"])
-    #     # monologue = monologues[0]
-    #     # monologue
-    #     start_time = (
-    #         lens["elements"]
-    #         .Each()
-    #         .Filter(lambda x: "timestamp" in x)["timestamp"]
-    #         .collect()(monologue)[0]
-    #     )
-    #     end_time = (
-    #         lens["elements"]
-    #         .Each()
-    #         .Filter(lambda x: "end_timestamp" in x)["end_timestamp"]
-    #         .collect()(monologue)[-1]
-    #     )
-    #     start_time, end_time
-    #
-    #     # offset by 500 msec to include first vad? discarded audio
-    #     speaker_channel[time_to_msecs(start_time) - 500 : time_to_msecs(end_time)]
-    #
-    #     # start_time = lens["elements"][0].get()(monologue)['timestamp']
-    #     # end_time = lens["elements"][-1].get()(monologue)['timestamp']
-    #     text = "".join(lens["elements"].Each()["value"].collect()(monologue))
-    #     text_clean = re.sub(r"\[.*\]", "", text)
-    #     # print(text)
-    #     # print(text_clean)
 
 
 def main():
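
Note on the second hunk: the transcript generator now skips slices shorter than 0.5 s instead of yielding them, the same threshold the deleted compute_endtime draft applied. A minimal sketch of the pattern, assuming pydub's AudioSegment (already used in this file); the time_to_msecs body below is a guessed reconstruction of the helper truncated at the first hunk boundary, and wav_seg, start_time, end_time, text_clean are stand-ins for the values flowing through the generator:

    import datetime
    from io import BytesIO

    from pydub import AudioSegment

    def time_to_msecs(time_str):
        # "00:01:23,500" -> 83500; strptime without a date parses against 1900-01-01
        t = datetime.datetime.strptime(time_str, "%H:%M:%S,%f")
        return int((t - datetime.datetime(1900, 1, 1)).total_seconds() * 1000)

    def clip_or_skip(wav_seg: AudioSegment, start_time, end_time, text_clean, min_secs=0.5):
        # pydub slices by milliseconds
        piece = wav_seg[time_to_msecs(start_time):time_to_msecs(end_time)]
        if piece.duration_seconds < min_secs:
            return None  # too little audio to make a useful ASR data point
        buf = BytesIO()
        piece.export(buf, format="wav")  # serialize the slice to in-memory wav bytes
        return text_clean, piece.duration_seconds, buf.getvalue()

For example, a slice from "00:00:01,000" to "00:00:01,200" returns None (0.2 s is under the threshold), so near-empty clips never reach asr_data_writer.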
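
The removed DEBUG block was prototyping per-speaker slicing of stereo call audio; a condensed sketch of that idea, assuming pydub and a hypothetical call.wav (the Agent=right / Client=left channel assignment is copied from the DEBUG code and may differ per telephony setup):

    from pydub import AudioSegment

    call = AudioSegment.from_wav("call.wav")        # hypothetical stereo recording
    left_audio, right_audio = call.split_to_mono()  # pydub returns [left, right]
    channel_map = {"Agent": right_audio, "Client": left_audio}

    # slice one monologue from the speaker's own channel, padding 500 ms before
    # the first timestamp so a VAD-clipped onset is not lost (the open question
    # in the DEBUG comment)
    start_ms, end_ms = 12_000, 15_500               # stand-in monologue timestamps
    snippet = channel_map["Agent"][max(0, start_ms - 500):end_ms]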