added support for name/dates/cities call data extraction and more logs

Malar Kannan 2020-06-15 10:24:38 +05:30
parent a7a25e9b07
commit 120302aad3
2 changed files with 79 additions and 1 deletions

View File

@ -95,10 +95,12 @@ class ExtendedPath(type(Path())):
"""docstring for ExtendedPath.""" """docstring for ExtendedPath."""
def read_json(self):
    """Parse the file at this path as JSON and return the decoded object."""
    print(f'reading json from {self}')
    with self.open("r") as handle:
        parsed = json.load(handle)
    return parsed
def write_json(self, data):
    """Serialize ``data`` as 2-space-indented JSON to this path.

    Missing parent directories are created first. Returns whatever
    ``json.dump`` returns (``None``).
    """
    print(f'writing json to {self}')
    # Create the directory chain up front so a fresh output location works.
    self.parent.mkdir(parents=True, exist_ok=True)
    with self.open("w") as handle:
        outcome = json.dump(data, handle, indent=2)
    return outcome

View File

@ -1,6 +1,7 @@
import json import json
import shutil import shutil
from pathlib import Path from pathlib import Path
from enum import Enum
import typer import typer
from tqdm import tqdm from tqdm import tqdm
@ -176,6 +177,81 @@ def fill_unannotated(
) )
class ExtractionType(str, Enum):
    """Categories of utterance text that ``split_extract`` can select.

    The member value is used both as the lookup key into the conversation
    data and as the suffix appended to the destination dataset name.
    """

    date = "dates"
    city = "cities"
    name = "names"
@app.command()
def split_extract(
    data_name: str = typer.Option("call_alphanum", show_default=True),
    # NOTE(review): "valiation_data" looks like a misspelling of
    # "validation_data"; kept byte-for-byte because it is a runtime path
    # default — confirm against the on-disk layout before renaming.
    dump_dir: Path = Path("./data/valiation_data"),
    dump_file: Path = Path("ui_dump.json"),
    manifest_dir: Path = Path("./data/asr_data"),
    manifest_file: Path = Path("manifest.json"),
    corrections_file: Path = Path("corrections.json"),
    conv_data_path: Path = Path("./data/conv_data.json"),
    extraction_type: ExtractionType = ExtractionType.date,
):
    """Split out the entries of a dataset whose text is a known date/city/name.

    Reads the manifest, UI dump and corrections of ``data_name`` and writes
    the subset whose ``text`` is one of the values selected by
    ``extraction_type`` into a sibling dataset named
    ``<data_name>_<extraction_type.value>`` (manifest plus copied audio under
    ``manifest_dir``; UI dump and corrections under ``dump_dir``).
    """

    def load_json(path):
        # Close the handle deterministically instead of leaking it to the GC.
        with path.open() as jf:
            return json.load(jf)

    def get_conv_data(cdp):
        """Build the lookup sets of accepted texts for every extraction type."""
        from itertools import product

        conv_data = load_json(cdp)
        days = [str(i) for i in range(1, 32)]
        months = conv_data["months"]
        # Every "<day> <month>" combination, e.g. day "1" joined to each month.
        day_months = {d + " " + m for d, m in product(days, months)}
        return {
            "cities": set(conv_data["cities"]),
            "names": set(conv_data["names"]),
            "dates": day_months,
        }

    dest_data_name = data_name + "_" + extraction_type.value
    data_manifest_path = manifest_dir / Path(data_name) / manifest_file
    extraction_vals = get_conv_data(conv_data_path)[extraction_type.value]
    manifest_gen = asr_manifest_reader(data_manifest_path)

    # Destination layout for the extracted ASR data.
    dest_data_dir = manifest_dir / Path(dest_data_name)
    dest_data_dir.mkdir(exist_ok=True, parents=True)
    (dest_data_dir / Path("wav")).mkdir(exist_ok=True, parents=True)
    dest_manifest_path = dest_data_dir / manifest_file

    # Destination layout for the extracted UI dump and corrections.
    dest_ui_dir = dump_dir / Path(dest_data_name)
    dest_ui_dir.mkdir(exist_ok=True, parents=True)
    dest_ui_path = dest_ui_dir / dump_file
    dest_correction_path = dest_ui_dir / corrections_file

    def extract_manifest(mg):
        """Yield matching manifest entries, copying their audio as a side effect."""
        for m in mg:
            if m["text"] not in extraction_vals:
                continue
            # presumably "audio_path" is the resolved source path added by
            # asr_manifest_reader, while "audio_filepath" is the
            # manifest-relative path — verify against the reader.
            dest_audio_path = dest_data_dir / Path(m["audio_filepath"])
            # Only "wav/" is pre-created above; make sure any other relative
            # sub-directory exists before copying into it.
            dest_audio_path.parent.mkdir(exist_ok=True, parents=True)
            shutil.copy(m["audio_path"], dest_audio_path)
            yield m

    asr_manifest_writer(dest_manifest_path, extract_manifest(manifest_gen))

    ui_data_path = dump_dir / Path(data_name) / dump_file
    corrections_path = dump_dir / Path(data_name) / corrections_file
    ui_data = load_json(ui_data_path)["data"]
    # Corrections refer to UI entries by the stem of the audio file name.
    file_ui_map = {Path(u["audio_filepath"]).stem: u for u in ui_data}
    corrections = load_json(corrections_path)

    extracted_ui_data = [u for u in ui_data if u["text"] in extraction_vals]
    # NOTE(review): the source dump is read from under a "data" key but the
    # extracted dump is written as a bare list — confirm consumers expect that.
    ExtendedPath(dest_ui_path).write_json(extracted_ui_data)

    extracted_corrections = [
        c
        for c in corrections
        if c["code"] in file_ui_map
        and file_ui_map[c["code"]]["text"] in extraction_vals
    ]
    ExtendedPath(dest_correction_path).write_json(extracted_corrections)
@app.command() @app.command()
def update_corrections( def update_corrections(
data_name: str = typer.Option("call_alphanum", show_default=True), data_name: str = typer.Option("call_alphanum", show_default=True),
@ -188,7 +264,7 @@ def update_corrections(
skip_incorrect: bool = True, skip_incorrect: bool = True,
): ):
data_manifest_path = manifest_dir / Path(data_name) / manifest_file data_manifest_path = manifest_dir / Path(data_name) / manifest_file
corrections_path = manifest_dir / Path(data_name) / corrections_file corrections_path = dump_dir / Path(data_name) / corrections_file
def correct_manifest(manifest_data_gen, corrections_path): def correct_manifest(manifest_data_gen, corrections_path):
corrections = json.load(corrections_path.open()) corrections = json.load(corrections_path.open())