diff --git a/.gitignore b/.gitignore
index 8900361..bda7618 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
 /train/
 .env*
 *.yaml
+*.json
 
 
 # Created by https://www.gitignore.io/api/python
diff --git a/jasper/data/call_recycler.py b/jasper/data/call_recycler.py
index 0639f41..4de5ef9 100644
--- a/jasper/data/call_recycler.py
+++ b/jasper/data/call_recycler.py
@@ -215,7 +215,7 @@ def analyze(
         assert evs[0]["Type"] == "CONV_RESULT"
         assert evs[1]["Type"] == "STARTED_SPEAKING"
         assert evs[2]["Type"] == "STOPPED_SPEAKING"
-        start_time = td_fn(evs[1]).total_seconds() - 1.5
+        start_time = td_fn(evs[1]).total_seconds() - 2
         end_time = td_fn(evs[2]).total_seconds()
         spoken = evs[0]["Msg"]
         data_points.append(
diff --git a/jasper/data/conv_generator.py b/jasper/data/conv_generator.py
new file mode 100644
index 0000000..c0e1b2c
--- /dev/null
+++ b/jasper/data/conv_generator.py
@@ -0,0 +1,64 @@
+import typer
+from pathlib import Path
+from random import randrange
+from itertools import product
+
+app = typer.Typer()
+
+
+@app.command()
+def export_conv_json(
+    conv_src: Path = typer.Option(Path("./conv_data.json"), show_default=True),
+    conv_dest: Path = typer.Option(Path("./data/conv_data.json"), show_default=True),
+):
+    from .utils import ExtendedPath
+
+    conv_data = ExtendedPath(conv_src).read_json()
+
+    days = [i for i in range(1, 32)]
+    months = [
+        "January",
+        "February",
+        "March",
+        "April",
+        "May",
+        "June",
+        "July",
+        "August",
+        "September",
+        "October",
+        "November",
+        "December",
+    ]
+    # ordinal from https://stackoverflow.com/questions/9647202/ordinal-numbers-replacement
+
+    def ordinal(n):
+        return "%d%s" % (n, "tsnrhtdd"[(n // 10 % 10 != 1) * (n % 10 < 4) * n % 10 :: 4])
+
+    def canon_vars(d, m):
+        return [
+            ordinal(d) + " " + m,
+            m + " " + ordinal(d),
+            ordinal(d) + " of " + m,
+            m + " the " + ordinal(d),
+            str(d) + " " + m,
+            m + " " + str(d),
+        ]
+
+    day_months = [dm for d, m in product(days, months) for dm in canon_vars(d, m)]
+
+    conv_data["dates"] = day_months
+
+    def dates_data_gen():
+        i = randrange(len(day_months))
+        return day_months[i]
+
+    ExtendedPath(conv_dest).write_json(conv_data)
+
+
+def main():
+    app()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/jasper/data/unique_nlu.py b/jasper/data/unique_nlu.py
new file mode 100644
index 0000000..f2446db
--- /dev/null
+++ b/jasper/data/unique_nlu.py
@@ -0,0 +1,92 @@
+import pandas as pd
+
+
+def compute_pnr_name_city():
+    data = pd.read_csv("./customer_utterance_processing/customer_provide_answer.csv")
+
+    def unique_pnr_count():
+        pnr_data = data[data["Input.Answer"] == "ZZZZZZ"]
+        unique_pnr_set = {
+            t
+            for n in range(1, 5)
+            for t in pnr_data[f"Answer.utterance-{n}"].tolist()
+            if "ZZZZZZ" in t
+        }
+        return len(unique_pnr_set)
+
+    def unique_name_count():
+        pnr_data = data[data["Input.Answer"] == "John Doe"]
+        unique_pnr_set = {
+            t
+            for n in range(1, 5)
+            for t in pnr_data[f"Answer.utterance-{n}"].tolist()
+            if "John Doe" in t
+        }
+        return len(unique_pnr_set)
+
+    def unique_city_count():
+        pnr_data = data[data["Input.Answer"] == "Heathrow Airport"]
+        unique_pnr_set = {
+            t
+            for n in range(1, 5)
+            for t in pnr_data[f"Answer.utterance-{n}"].tolist()
+            if "Heathrow Airport" in t
+        }
+        return len(unique_pnr_set)
+
+    def unique_entity_count(entity_template_tags):
+        # entity_data = data[data['Input.Prompt'] == entity_template_tag]
+        entity_data = data
+        unique_entity_set = {
+            t
+            for n in range(1, 5)
+            for t in entity_data[f"Answer.utterance-{n}"].tolist()
+            if any(et in t for et in entity_template_tags)
+        }
+        return len(unique_entity_set)
+
+    print('PNR', unique_pnr_count())
+    print('Name', unique_name_count())
+    print('City', unique_city_count())
+    print('Payment', unique_entity_count(['KPay', 'ZPay', 'Credit Card']))
+
+
+def compute_date():
+    entity_template_tags = ['27 january', 'December 18']
+    data = pd.read_csv("./customer_utterance_processing/customer_provide_departure.csv")
+    # data.sample(10)
+
+    def unique_entity_count(entity_template_tags):
+        # entity_data = data[data['Input.Prompt'] == entity_template_tag]
+        entity_data = data
+        unique_entity_set = {
+            t
+            for n in range(1, 5)
+            for t in entity_data[f"Answer.utterance-{n}"].tolist()
+            if any(et in t for et in entity_template_tags)
+        }
+        return len(unique_entity_set)
+
+    print('Date', unique_entity_count(entity_template_tags))
+
+
+def compute_option():
+    entity_template_tag = 'third'
+    data = pd.read_csv("./customer_utterance_processing/customer_provide_flight_selection.csv")
+
+    def unique_entity_count():
+        entity_data = data[data['Input.Prompt'] == entity_template_tag]
+        unique_entity_set = {
+            t
+            for n in range(1, 5)
+            for t in entity_data[f"Answer.utterance-{n}"].tolist()
+            if entity_template_tag in t
+        }
+        return len(unique_entity_set)
+
+    print('Option', unique_entity_count())
+
+
+compute_pnr_name_city()
+compute_date()
+compute_option()
diff --git a/jasper/data/validation/process.py b/jasper/data/validation/process.py
index 0f73480..44133ef 100644
--- a/jasper/data/validation/process.py
+++ b/jasper/data/validation/process.py
@@ -135,7 +135,6 @@ def dump_ui(
         "annotation_only": annotation_only,
         "enable_plots": enable_plots,
     }
-    typer.echo(f"Writing dump to {dump_path}")
     ExtendedPath(dump_path).write_json(ui_config)
 
 
@@ -150,7 +149,6 @@ def dump_corrections(
 
     cursor_obj = col.find({"type": "correction"}, projection={"_id": False})
    corrections = [c for c in cursor_obj]
-    typer.echo(f"Writing dump to {dump_path}")
     ExtendedPath(dump_path).write_json(corrections)
 
 
diff --git a/setup.py b/setup.py
index 87cf7e9..e879af2 100644
--- a/setup.py
+++ b/setup.py
@@ -63,7 +63,8 @@ setup(
             "jasper_transcribe = jasper.transcribe:main",
             "jasper_server = jasper.server:main",
             "jasper_trainer = jasper.training.cli:main",
-            "jasper_data_generate = jasper.data.tts_generator:main",
+            "jasper_data_tts_generate = jasper.data.tts_generator:main",
+            "jasper_data_conv_generate = jasper.data.conv_generator:main",
             "jasper_data_call_recycle = jasper.data.call_recycler:main",
             "jasper_data_asr_recycle = jasper.data.asr_recycler:main",
             "jasper_data_rev_recycle = jasper.data.rev_recycler:main",