show duration on validation of dataset
parent
de21952349
commit
9f9cb62b60
|
|
@ -45,20 +45,28 @@ def split_data(dataset_path: Path, test_size: float = 0.1):
|
||||||
|
|
||||||
@app.command()
|
@app.command()
|
||||||
def validate_data(dataset_path: Path):
|
def validate_data(dataset_path: Path):
|
||||||
|
from natural.date import compress
|
||||||
|
from datetime import timedelta
|
||||||
|
|
||||||
for mf_type in ["train_manifest.json", "test_manifest.json"]:
|
for mf_type in ["train_manifest.json", "test_manifest.json"]:
|
||||||
data_file = dataset_path / Path(mf_type)
|
data_file = dataset_path / Path(mf_type)
|
||||||
print(f"validating {data_file}.")
|
print(f"validating {data_file}.")
|
||||||
with Path(data_file).open("r") as pf:
|
with Path(data_file).open("r") as pf:
|
||||||
pnr_jsonl = pf.readlines()
|
pnr_jsonl = pf.readlines()
|
||||||
|
duration = 0
|
||||||
for (i, s) in enumerate(pnr_jsonl):
|
for (i, s) in enumerate(pnr_jsonl):
|
||||||
try:
|
try:
|
||||||
d = json.loads(s)
|
d = json.loads(s)
|
||||||
|
duration += d["duration"]
|
||||||
audio_file = data_file.parent / Path(d["audio_filepath"])
|
audio_file = data_file.parent / Path(d["audio_filepath"])
|
||||||
if not audio_file.exists():
|
if not audio_file.exists():
|
||||||
raise OSError(f"File {audio_file} not found")
|
raise OSError(f"File {audio_file} not found")
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
print(f'failed on {i} with "{e}"')
|
print(f'failed on {i} with "{e}"')
|
||||||
print(f"no errors found. seems like a valid {mf_type}.")
|
duration_str = compress(timedelta(seconds=duration), pad=" ")
|
||||||
|
print(
|
||||||
|
f"no errors found. seems like a valid {mf_type}. contains {duration_str}sec of audio"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
|
||||||
3
setup.py
3
setup.py
|
|
@ -37,7 +37,8 @@ extra_requirements = {
|
||||||
"matplotlib==3.2.1",
|
"matplotlib==3.2.1",
|
||||||
"pydub~=0.24.0",
|
"pydub~=0.24.0",
|
||||||
"streamlit==0.58.0",
|
"streamlit==0.58.0",
|
||||||
"stringcase==1.2.0"
|
"natural==0.2.0",
|
||||||
|
"stringcase==1.2.0",
|
||||||
]
|
]
|
||||||
# "train": [
|
# "train": [
|
||||||
# "torchaudio==0.5.0",
|
# "torchaudio==0.5.0",
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue