cleanup rev recycle

2026-03-08 02:22:34 +00:00 · 2020-05-27 15:33:22 +05:30
parent 1acf9e403c
commit 7ff2db3e2e
1 changed files with 3 additions and 70 deletions
--- a/jasper/data/rev_recycler.py
+++ b/jasper/data/rev_recycler.py
@@ -49,39 +49,6 @@ def extract_data(

        return filter_func

-    # def compute_endtime(call_wav, state):
-    #     for (i, st) in enumerate(state):
-    #         start_time = st["AsrResult"]["Alternatives"][0].get("StartTime", 0)
-    #         transcript = st["AsrResult"]["Alternatives"][0]["Transcript"]
-    #         if i + 1 < len(state):
-    #             end_time = state[i + 1]["AsrResult"]["Alternatives"][0]["StartTime"]
-    #         else:
-    #             end_time = call_wav.duration_seconds
-    #         full_code_seg = call_wav[start_time * 1000 : end_time * 1000]
-    #         code_seg = strip_silence(full_code_seg)
-    #         code_fb = BytesIO()
-    #         code_seg.export(code_fb, format="wav")
-    #         code_wav = code_fb.getvalue()
-    #         # only starting 1 min audio has reliable alignment
-    #         if start_time > 60:
-    #             break
-    #         # only of some audio data is present yield it
-    #         if code_seg.duration_seconds >= 0.5:
-    #             yield transcript, code_seg.duration_seconds, code_wav
-
-    # def generate_call_asr_data():
-    #     full_asr_data = []
-    #     total_duration = 0
-    #     for wav, wav_path, ev in wav_event_generator(call_audio_dir):
-    #         asr_data = asr_data_generator(wav, wav_path, ev)
-    #         total_duration += wav.duration_seconds
-    #         full_asr_data.append(asr_data)
-    #     typer.echo(f"loaded {len(full_asr_data)} calls of duration {total_duration}s")
-    #     n_dps = asr_data_writer(call_asr_data, dataset_name, chain(*full_asr_data))
-    #     typer.echo(f"written {n_dps} data points")
-
-    # generate_call_asr_data()
-
    def time_to_msecs(time_str):
        return (
            datetime.datetime.strptime(time_str, "%H:%M:%S,%f")
@@ -123,6 +90,8 @@ def extract_data(
            tscript_wav = tscript_wav_fb.getvalue()
            text = "".join(lens["elements"].Each()["value"].collect()(monologue))
            text_clean = re.sub(r"\[.*\]", "", text)
+            # yield the segment only if it holds a reasonable amount of audio (>= 0.5 s)
+            if tscript_wav_seg.duration_seconds >= 0.5:
                yield text_clean, tscript_wav_seg.duration_seconds, tscript_wav

    def mono_asr_data_generator(wav_seg, wav_path, meta):
@@ -170,42 +139,6 @@ def extract_data(
        typer.echo(f"written {n_dps} data points")

    generate_rev_asr_data()
-    # DEBUG
-    # data = list(wav_event_generator(call_audio_dir))
-    # wav_seg, wav_path, meta = data[0]
-    # left_audio, right_audio = wav_seg.split_to_mono()
-    # channel_map = {"Agent": right_audio, "Client": left_audio}
-    # # data[0][2]['speakers']
-    # # data[0][1]
-    # monologues = lens["monologues"].Each().collect()(meta)
-    # for monologue in monologues:
-    #     # print(monologue["speaker_name"])
-    #     speaker_channel = channel_map.get(monologue["speaker_name"])
-    #     # monologue = monologues[0]
-    #     # monologue
-    #     start_time = (
-    #         lens["elements"]
-    #         .Each()
-    #         .Filter(lambda x: "timestamp" in x)["timestamp"]
-    #         .collect()(monologue)[0]
-    #     )
-    #     end_time = (
-    #         lens["elements"]
-    #         .Each()
-    #         .Filter(lambda x: "end_timestamp" in x)["end_timestamp"]
-    #         .collect()(monologue)[-1]
-    #     )
-    #     start_time, end_time
-    #
-    #     # offset by 500 msec to include first vad? discarded audio
-    #     speaker_channel[time_to_msecs(start_time) - 500 : time_to_msecs(end_time)]
-    #
-    #     # start_time = lens["elements"][0].get()(monologue)['timestamp']
-    #     # end_time = lens["elements"][-1].get()(monologue)['timestamp']
-    #     text = "".join(lens["elements"].Each()["value"].collect()(monologue))
-    #     text_clean = re.sub(r"\[.*\]", "", text)
-    #     # print(text)
-    #     # print(text_clean)


 def main():