show duration on validation of dataset

Malar Kannan 2020-05-28 11:35:31 +05:30
parent de21952349
commit 9f9cb62b60
2 changed files with 11 additions and 2 deletions

View File

@@ -45,20 +45,28 @@ def split_data(dataset_path: Path, test_size: float = 0.1):
@app.command() @app.command()
def validate_data(dataset_path: Path): def validate_data(dataset_path: Path):
from natural.date import compress
from datetime import timedelta
for mf_type in ["train_manifest.json", "test_manifest.json"]: for mf_type in ["train_manifest.json", "test_manifest.json"]:
data_file = dataset_path / Path(mf_type) data_file = dataset_path / Path(mf_type)
print(f"validating {data_file}.") print(f"validating {data_file}.")
with Path(data_file).open("r") as pf: with Path(data_file).open("r") as pf:
pnr_jsonl = pf.readlines() pnr_jsonl = pf.readlines()
duration = 0
for (i, s) in enumerate(pnr_jsonl): for (i, s) in enumerate(pnr_jsonl):
try: try:
d = json.loads(s) d = json.loads(s)
duration += d["duration"]
audio_file = data_file.parent / Path(d["audio_filepath"]) audio_file = data_file.parent / Path(d["audio_filepath"])
if not audio_file.exists(): if not audio_file.exists():
raise OSError(f"File {audio_file} not found") raise OSError(f"File {audio_file} not found")
except BaseException as e: except BaseException as e:
print(f'failed on {i} with "{e}"') print(f'failed on {i} with "{e}"')
- print(f"no errors found. seems like a valid {mf_type}.")
+ duration_str = compress(timedelta(seconds=duration), pad=" ")
+ print(
+ f"no errors found. seems like a valid {mf_type}. contains {duration_str}sec of audio"
+ )
def main(): def main():

View File

@@ -37,7 +37,8 @@ extra_requirements = {
"matplotlib==3.2.1", "matplotlib==3.2.1",
"pydub~=0.24.0", "pydub~=0.24.0",
"streamlit==0.58.0", "streamlit==0.58.0",
- "stringcase==1.2.0"
+ "natural==0.2.0",
+ "stringcase==1.2.0",
] ]
# "train": [ # "train": [
# "torchaudio==0.5.0", # "torchaudio==0.5.0",