@app.command()
def validate_data(dataset_path: Path):
    """Validate the train and test manifests found under *dataset_path*.

    Each manifest (``train_manifest.json`` and ``test_manifest.json``) is read
    as JSON Lines. Every non-blank line must parse as JSON, carry a
    ``duration`` value, and name an ``audio_filepath`` that exists relative to
    the manifest's own directory. A failing line is reported with its index
    and the scan continues with the next line.

    After each manifest a summary is printed: either the number of failing
    lines or a "no errors found" message, together with the total duration of
    the entries that passed validation.

    Args:
        dataset_path: Directory containing the two manifest files.

    Raises:
        OSError: If a manifest file itself cannot be opened.
    """
    # Imported lazily so the rest of the module does not depend on the
    # third-party ``natural`` package, which is only needed for this report.
    from datetime import timedelta

    from natural.date import compress

    for mf_type in ["train_manifest.json", "test_manifest.json"]:
        data_file = dataset_path / mf_type
        print(f"validating {data_file}.")
        duration = 0
        error_count = 0
        with data_file.open("r") as pf:
            for i, s in enumerate(pf):
                # A blank line (e.g. a trailing newline) is not a record.
                if not s.strip():
                    continue
                try:
                    d = json.loads(s)
                    entry_duration = d["duration"]
                    audio_file = data_file.parent / d["audio_filepath"]
                    if not audio_file.exists():
                        raise OSError(f"File {audio_file} not found")
                    # Only count audio that is actually present on disk.
                    duration += entry_duration
                except Exception as e:
                    # Exception, not BaseException: Ctrl-C must still abort.
                    error_count += 1
                    print(f'failed on {i} with "{e}"')
        # compress() already emits unit suffixes (e.g. "3h 25m 45s"), so no
        # extra "sec" is appended to the message.
        duration_str = compress(timedelta(seconds=duration), pad=" ")
        if error_count:
            print(
                f"found {error_count} error(s) in {mf_type}. "
                f"valid entries contain {duration_str} of audio"
            )
        else:
            print(
                f"no errors found. seems like a valid {mf_type}. "
                f"contains {duration_str} of audio"
            )