1. added conv data generator

2. more utils
Malar Kannan 2020-06-16 15:38:07 +05:30
parent 7472b6457d
commit 7dbb04dcbf
6 changed files with 160 additions and 4 deletions

1
.gitignore vendored
View File

@ -3,6 +3,7 @@
/train/ /train/
.env* .env*
*.yaml *.yaml
*.json
# Created by https://www.gitignore.io/api/python # Created by https://www.gitignore.io/api/python

View File

@ -215,7 +215,7 @@ def analyze(
assert evs[0]["Type"] == "CONV_RESULT" assert evs[0]["Type"] == "CONV_RESULT"
assert evs[1]["Type"] == "STARTED_SPEAKING" assert evs[1]["Type"] == "STARTED_SPEAKING"
assert evs[2]["Type"] == "STOPPED_SPEAKING" assert evs[2]["Type"] == "STOPPED_SPEAKING"
start_time = td_fn(evs[1]).total_seconds() - 1.5 start_time = td_fn(evs[1]).total_seconds() - 2
end_time = td_fn(evs[2]).total_seconds() end_time = td_fn(evs[2]).total_seconds()
spoken = evs[0]["Msg"] spoken = evs[0]["Msg"]
data_points.append( data_points.append(

View File

@ -0,0 +1,64 @@
import typer
from pathlib import Path
from random import randrange
from itertools import product
app = typer.Typer()


def _ordinal(n):
    """Return the integer *n* followed by its English ordinal suffix.

    Examples: 1 -> "1st", 2 -> "2nd", 3 -> "3rd", 4 -> "4th", 11 -> "11th",
    21 -> "21st".
    """
    # The tens digit must be taken with floor division: with true division
    # 11 / 10 % 10 is 1.1, the "teens" test never matches, and 11, 12, 13
    # would wrongly come out as "11st", "12nd", "13rd".
    if n // 10 % 10 == 1:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
    return f"{n}{suffix}"


def _date_variants(day, month):
    """Return the six spoken/written renderings of one day/month pair."""
    nth = _ordinal(day)
    return [
        nth + " " + month,
        month + " " + nth,
        nth + " of " + month,
        month + " the " + nth,
        str(day) + " " + month,
        month + " " + str(day),
    ]


@app.command()
def export_conv_json(
    conv_src: Path = typer.Option(Path("./conv_data.json"), show_default=True),
    conv_dest: Path = typer.Option(Path("./data/conv_data.json"), show_default=True),
):
    """Add a "dates" entry to the conversation JSON and write it out.

    The JSON at ``conv_src`` is read, its ``"dates"`` key is set to a list of
    date phrases (six renderings for every day 1-31 combined with every month
    name, ordered by day and then by month), and the result is written to
    ``conv_dest``.

    Args:
        conv_src: Path of the JSON file to read.
        conv_dest: Path of the JSON file to write.
    """
    # Imported lazily, as in the original, so the module can be loaded
    # without pulling in the utils module up front.
    from .utils import ExtendedPath

    conv_data = ExtendedPath(conv_src).read_json()
    months = [
        "January",
        "February",
        "March",
        "April",
        "May",
        "June",
        "July",
        "August",
        "September",
        "October",
        "November",
        "December",
    ]
    # NOTE(review): the full cross product also yields impossible dates such
    # as "31st February"; presumably acceptable for generated training text,
    # confirm before relying on these as valid calendar dates.
    day_months = [
        variant
        for day, month in product(range(1, 32), months)
        for variant in _date_variants(day, month)
    ]
    conv_data["dates"] = day_months
    ExtendedPath(conv_dest).write_json(conv_data)
def main():
    """Run the Typer command-line application defined in this module."""
    app()


# Allow the module to be executed directly as a script.
if __name__ == "__main__":
    main()

92
jasper/data/unique_nlu.py Normal file
View File

@ -0,0 +1,92 @@
import pandas as pd
def compute_pnr_name_city(
    csv_path: str = "./customer_utterance_processing/customer_provide_answer.csv",
):
    """Print how many distinct utterances mention each placeholder entity.

    Reads the CSV at ``csv_path`` and scans the columns
    ``Answer.utterance-1`` .. ``Answer.utterance-4``. For PNR, Name and City
    only rows whose ``Input.Answer`` equals the placeholder are considered;
    for Payment every row is scanned. One line per entity is printed.

    Args:
        csv_path: Location of the CSV file; defaults to the path the original
            script hard-coded.
    """
    data = pd.read_csv(csv_path)
    utterance_columns = [f"Answer.utterance-{n}" for n in range(1, 5)]

    def unique_utterance_count(frame, tags):
        """Count distinct utterance strings in *frame* containing any tag."""
        return len(
            {
                text
                for column in utterance_columns
                for text in frame[column].tolist()
                # Empty CSV cells are read back as NaN floats; a substring
                # test on them raises TypeError, so skip non-string cells.
                if isinstance(text, str) and any(tag in text for tag in tags)
            }
        )

    def unique_answer_count(answer):
        """Count distinct utterances echoing *answer* among its own rows."""
        matching_rows = data[data["Input.Answer"] == answer]
        return unique_utterance_count(matching_rows, [answer])

    print("PNR", unique_answer_count("ZZZZZZ"))
    print("Name", unique_answer_count("John Doe"))
    print("City", unique_answer_count("Heathrow Airport"))
    print("Payment", unique_utterance_count(data, ["KPay", "ZPay", "Credit Card"]))
def compute_date(
    csv_path: str = "./customer_utterance_processing/customer_provide_departure.csv",
):
    """Print how many distinct utterances contain a template date string.

    Reads the CSV at ``csv_path``, scans the columns ``Answer.utterance-1``
    .. ``Answer.utterance-4`` of every row, and prints the number of distinct
    utterances containing either template date (matching is case-sensitive).

    Args:
        csv_path: Location of the CSV file; defaults to the path the original
            script hard-coded.
    """
    entity_template_tags = ["27 january", "December 18"]
    data = pd.read_csv(csv_path)

    def unique_entity_count(entity_template_tags):
        """Count distinct utterance strings containing any of the tags."""
        unique_entity_set = {
            text
            for n in range(1, 5)
            for text in data[f"Answer.utterance-{n}"].tolist()
            # Empty CSV cells are read back as NaN floats; a substring test
            # on them raises TypeError, so skip non-string cells.
            if isinstance(text, str)
            and any(tag in text for tag in entity_template_tags)
        }
        return len(unique_entity_set)

    print("Date", unique_entity_count(entity_template_tags))
def compute_option(
    csv_path: str = "./customer_utterance_processing/customer_provide_flight_selection.csv",
):
    """Print how many distinct utterances mention the template option word.

    Reads the CSV at ``csv_path``, keeps the rows whose ``Input.Prompt``
    equals ``"third"``, scans their ``Answer.utterance-1`` ..
    ``Answer.utterance-4`` columns, and prints the number of distinct
    utterances containing ``"third"``.

    Args:
        csv_path: Location of the CSV file; defaults to the path the original
            script hard-coded.
    """
    entity_template_tag = "third"
    data = pd.read_csv(csv_path)

    def unique_entity_count():
        """Count distinct utterance strings containing the template tag."""
        entity_data = data[data["Input.Prompt"] == entity_template_tag]
        unique_entity_set = {
            text
            for n in range(1, 5)
            for text in entity_data[f"Answer.utterance-{n}"].tolist()
            # Empty CSV cells are read back as NaN floats; a substring test
            # on them raises TypeError, so skip non-string cells.
            if isinstance(text, str) and entity_template_tag in text
        }
        return len(unique_entity_set)

    print("Option", unique_entity_count())
# Run every report, in the same order as before, when the module is loaded.
for _report in (compute_pnr_name_city, compute_date, compute_option):
    _report()

View File

@ -135,7 +135,6 @@ def dump_ui(
"annotation_only": annotation_only, "annotation_only": annotation_only,
"enable_plots": enable_plots, "enable_plots": enable_plots,
} }
typer.echo(f"Writing dump to {dump_path}")
ExtendedPath(dump_path).write_json(ui_config) ExtendedPath(dump_path).write_json(ui_config)
@ -150,7 +149,6 @@ def dump_corrections(
cursor_obj = col.find({"type": "correction"}, projection={"_id": False}) cursor_obj = col.find({"type": "correction"}, projection={"_id": False})
corrections = [c for c in cursor_obj] corrections = [c for c in cursor_obj]
typer.echo(f"Writing dump to {dump_path}")
ExtendedPath(dump_path).write_json(corrections) ExtendedPath(dump_path).write_json(corrections)

View File

@ -63,7 +63,8 @@ setup(
"jasper_transcribe = jasper.transcribe:main", "jasper_transcribe = jasper.transcribe:main",
"jasper_server = jasper.server:main", "jasper_server = jasper.server:main",
"jasper_trainer = jasper.training.cli:main", "jasper_trainer = jasper.training.cli:main",
"jasper_data_generate = jasper.data.tts_generator:main", "jasper_data_tts_generate = jasper.data.tts_generator:main",
"jasper_data_conv_generate = jasper.data.conv_generator:main",
"jasper_data_call_recycle = jasper.data.call_recycler:main", "jasper_data_call_recycle = jasper.data.call_recycler:main",
"jasper_data_asr_recycle = jasper.data.asr_recycler:main", "jasper_data_asr_recycle = jasper.data.asr_recycler:main",
"jasper_data_rev_recycle = jasper.data.rev_recycler:main", "jasper_data_rev_recycle = jasper.data.rev_recycler:main",