diff --git a/jasper/data/call_recycler.py b/jasper/data/call_recycler.py index 7b08d90..881159b 100644 --- a/jasper/data/call_recycler.py +++ b/jasper/data/call_recycler.py @@ -121,6 +121,7 @@ def analyze( ), start_delay: float = 1.5, download_only: bool = False, + strip_silent_chunks: bool = True, call_logs_file: Path = typer.Option(Path("./call_logs.yaml"), show_default=True), output_dir: Path = Path("./data"), data_name: str = None, @@ -408,6 +409,16 @@ def analyze( pprint(call_plots) def extract_data_points(): + if strip_silent_chunks: + + def audio_process(seg): + return strip_silence(seg) + + else: + + def audio_process(seg): + return seg + def gen_data_values(saved_wav_path, data_points, caller_name): call_seg = ( AudioSegment.from_wav(saved_wav_path) @@ -417,12 +428,15 @@ def analyze( ) for dp_id, dp in enumerate(data_points): start, end, spoken = dp["start_time"], dp["end_time"], dp["code"] - spoken_seg = strip_silence(call_seg[start * 1000 : end * 1000]) + spoken_seg = audio_process(call_seg[start * 1000 : end * 1000]) spoken_fb = BytesIO() spoken_seg.export(spoken_fb, format="wav") spoken_wav = spoken_fb.getvalue() # search for actual pnr code and handle plain codes as well extracted_code = text_extractor(spoken) + if strip_silent_chunks and spoken_seg.duration_seconds < 0.5: + print(f'transcript chunk "{spoken}" contains no audio skipping.') + continue yield extracted_code, spoken_seg.duration_seconds, spoken_wav, caller_name, spoken_seg call_lens = lens["users"].Each()["calls"].Each() diff --git a/jasper/data/validation/process.py b/jasper/data/validation/process.py index 7013577..a8b111c 100644 --- a/jasper/data/validation/process.py +++ b/jasper/data/validation/process.py @@ -194,6 +194,7 @@ def dump_corrections( @app.command() def caller_quality( + task_uid: str, data_name: str = typer.Option("call_upwork_train_cnd", show_default=True), dump_dir: Path = Path("./data/asr_data"), dump_fname: Path = Path("ui_dump.json"), @@ -214,7 +215,11 @@ def caller_quality( dp["valid"] = c["value"]["status"] == "Correct" return dp - corrected_dump = [correction_dp(c) for c in correction_data] + corrected_dump = [ + correction_dp(c) + for c in correction_data + if c["task_id"].rsplit("-", 1)[1] == task_uid + ] df = pd.DataFrame(corrected_dump) print(f"Total samples: {len(df)}") for (c, g) in df.groupby("caller"):