1. added conv data generator
2. more utils
parent
7472b6457d
commit
7dbb04dcbf
|
|
@ -3,6 +3,7 @@
|
||||||
/train/
|
/train/
|
||||||
.env*
|
.env*
|
||||||
*.yaml
|
*.yaml
|
||||||
|
*.json
|
||||||
|
|
||||||
|
|
||||||
# Created by https://www.gitignore.io/api/python
|
# Created by https://www.gitignore.io/api/python
|
||||||
|
|
|
||||||
|
|
@ -215,7 +215,7 @@ def analyze(
|
||||||
assert evs[0]["Type"] == "CONV_RESULT"
|
assert evs[0]["Type"] == "CONV_RESULT"
|
||||||
assert evs[1]["Type"] == "STARTED_SPEAKING"
|
assert evs[1]["Type"] == "STARTED_SPEAKING"
|
||||||
assert evs[2]["Type"] == "STOPPED_SPEAKING"
|
assert evs[2]["Type"] == "STOPPED_SPEAKING"
|
||||||
start_time = td_fn(evs[1]).total_seconds() - 1.5
|
start_time = td_fn(evs[1]).total_seconds() - 2
|
||||||
end_time = td_fn(evs[2]).total_seconds()
|
end_time = td_fn(evs[2]).total_seconds()
|
||||||
spoken = evs[0]["Msg"]
|
spoken = evs[0]["Msg"]
|
||||||
data_points.append(
|
data_points.append(
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,64 @@
|
||||||
|
import typer
|
||||||
|
from pathlib import Path
|
||||||
|
from random import randrange
|
||||||
|
from itertools import product
|
||||||
|
|
||||||
|
app = typer.Typer()
|
||||||
|
|
||||||
|
|
||||||
|
@app.command()
|
||||||
|
def export_conv_json(
|
||||||
|
conv_src: Path = typer.Option(Path("./conv_data.json"), show_default=True),
|
||||||
|
conv_dest: Path = typer.Option(Path("./data/conv_data.json"), show_default=True),
|
||||||
|
):
|
||||||
|
from .utils import ExtendedPath
|
||||||
|
|
||||||
|
conv_data = ExtendedPath(conv_src).read_json()
|
||||||
|
|
||||||
|
days = [i for i in range(1, 32)]
|
||||||
|
months = [
|
||||||
|
"January",
|
||||||
|
"February",
|
||||||
|
"March",
|
||||||
|
"April",
|
||||||
|
"May",
|
||||||
|
"June",
|
||||||
|
"July",
|
||||||
|
"August",
|
||||||
|
"September",
|
||||||
|
"October",
|
||||||
|
"November",
|
||||||
|
"December",
|
||||||
|
]
|
||||||
|
# ordinal from https://stackoverflow.com/questions/9647202/ordinal-numbers-replacement
|
||||||
|
|
||||||
|
def ordinal(n):
    """Return ``n`` followed by its English ordinal suffix.

    Examples: ``1 -> "1st"``, ``2 -> "2nd"``, ``3 -> "3rd"``, ``4 -> "4th"``,
    ``11 -> "11th"``, ``21 -> "21st"``.

    Args:
        n: A non-negative integer.

    Returns:
        The decimal rendering of ``n`` with "st", "nd", "rd" or "th" appended.
    """
    # Floor division is essential here: on Python 3 ``n / 10`` is a float, so
    # ``11 / 10 % 10`` evaluates to 1.1 and the "is this a teen?" check never
    # fires, which produced "11st", "12nd" and "13rd".
    is_not_teen = n // 10 % 10 != 1
    takes_special_suffix = n % 10 < 4
    # Index 0 selects "th"; 1, 2 and 3 select "st", "nd" and "rd" respectively
    # when the string is read with a stride of 4.
    suffix_start = is_not_teen * takes_special_suffix * n % 10
    return "%d%s" % (n, "tsnrhtdd"[suffix_start::4])
|
||||||
|
|
||||||
|
def canon_vars(d, m):
    """Return the six textual variants of day ``d`` in month ``m``.

    Four variants use the ordinal form of the day (as produced by
    ``ordinal``) and two use the plain number, each in day-first and
    month-first order.

    Args:
        d: Day of the month as an integer.
        m: Month name as a string.

    Returns:
        A list of six strings, in a fixed order.
    """
    day_ordinal = ordinal(d)
    day_plain = str(d)
    return [
        f"{day_ordinal} {m}",
        f"{m} {day_ordinal}",
        f"{day_ordinal} of {m}",
        f"{m} the {day_ordinal}",
        f"{day_plain} {m}",
        f"{m} {day_plain}",
    ]
|
||||||
|
|
||||||
|
day_months = [dm for d, m in product(days, months) for dm in canon_vars(d, m)]
|
||||||
|
|
||||||
|
conv_data["dates"] = day_months
|
||||||
|
|
||||||
|
def dates_data_gen():
    """Return one randomly selected entry from ``day_months``."""
    # NOTE(review): nothing in the visible source calls this helper.
    return day_months[randrange(len(day_months))]
|
||||||
|
|
||||||
|
ExtendedPath(conv_dest).write_json(conv_data)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Entry point: invoke the module-level typer ``app``."""
    app()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
@ -0,0 +1,92 @@
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
def compute_pnr_name_city():
    """Print unique-utterance counts for the PNR, name, city and payment entities.

    Reads ``./customer_utterance_processing/customer_provide_answer.csv`` and
    inspects the columns ``Answer.utterance-1`` .. ``Answer.utterance-4``.

    * PNR / Name / City: only rows whose ``Input.Answer`` equals the template
      value are considered, and an utterance counts when it contains that
      same template value.
    * Payment: every row is considered, and an utterance counts when it
      contains any of the payment tags.

    Each count is the number of distinct matching utterance strings and is
    printed on its own line, prefixed by its label.
    """
    data = pd.read_csv("./customer_utterance_processing/customer_provide_answer.csv")

    def unique_utterances(frame, tags):
        """Collect the distinct utterances in ``frame`` containing any tag."""
        return {
            text
            for n in range(1, 5)
            for text in frame[f"Answer.utterance-{n}"].tolist()
            # Empty CSV cells are loaded by pandas as float NaN; a substring
            # test against a float raises TypeError, so skip non-strings.
            if isinstance(text, str) and any(tag in text for tag in tags)
        }

    def unique_answer_count(answer):
        """Count distinct utterances echoing ``answer`` among rows prompted with it."""
        answer_rows = data[data["Input.Answer"] == answer]
        return len(unique_utterances(answer_rows, [answer]))

    def unique_entity_count(entity_template_tags):
        """Count distinct utterances, over all rows, containing any given tag."""
        return len(unique_utterances(data, entity_template_tags))

    print('PNR', unique_answer_count("ZZZZZZ"))
    print('Name', unique_answer_count("John Doe"))
    print('City', unique_answer_count("Heathrow Airport"))
    print('Payment', unique_entity_count(['KPay', 'ZPay', 'Credit Card']))
|
||||||
|
|
||||||
|
|
||||||
|
def compute_date():
    """Print how many distinct utterances mention one of the template dates.

    Reads ``./customer_utterance_processing/customer_provide_departure.csv``
    and scans the columns ``Answer.utterance-1`` .. ``Answer.utterance-4`` of
    every row. An utterance counts when it contains ``'27 january'`` or
    ``'December 18'`` (case-sensitive substring match). The number of
    distinct matching utterances is printed with the label ``Date``.
    """
    entity_template_tags = ['27 january', 'December 18']
    data = pd.read_csv("./customer_utterance_processing/customer_provide_departure.csv")

    def unique_entity_count(entity_template_tags):
        """Count distinct utterances containing any of the given tags."""
        unique_entity_set = {
            text
            for n in range(1, 5)
            for text in data[f"Answer.utterance-{n}"].tolist()
            # Empty CSV cells are loaded by pandas as float NaN; a substring
            # test against a float raises TypeError, so skip non-strings.
            if isinstance(text, str)
            and any(tag in text for tag in entity_template_tags)
        }
        return len(unique_entity_set)

    print('Date', unique_entity_count(entity_template_tags))
|
||||||
|
|
||||||
|
|
||||||
|
def compute_option():
    """Print how many distinct utterances mention the template option.

    Reads
    ``./customer_utterance_processing/customer_provide_flight_selection.csv``,
    keeps the rows whose ``Input.Prompt`` equals ``'third'`` and scans their
    ``Answer.utterance-1`` .. ``Answer.utterance-4`` columns. An utterance
    counts when it contains ``'third'``. The number of distinct matching
    utterances is printed with the label ``Option``.
    """
    entity_template_tag = 'third'
    data = pd.read_csv("./customer_utterance_processing/customer_provide_flight_selection.csv")

    def unique_entity_count():
        """Count distinct utterances containing the tag among rows prompted with it."""
        entity_data = data[data['Input.Prompt'] == entity_template_tag]
        unique_entity_set = {
            text
            for n in range(1, 5)
            for text in entity_data[f"Answer.utterance-{n}"].tolist()
            # Empty CSV cells are loaded by pandas as float NaN; a substring
            # test against a float raises TypeError, so skip non-strings.
            if isinstance(text, str) and entity_template_tag in text
        }
        return len(unique_entity_set)

    print('Option', unique_entity_count())
|
||||||
|
|
||||||
|
|
||||||
|
# Run every analysis in order; each one prints its own counts.
for report in (compute_pnr_name_city, compute_date, compute_option):
    report()
|
||||||
|
|
@ -135,7 +135,6 @@ def dump_ui(
|
||||||
"annotation_only": annotation_only,
|
"annotation_only": annotation_only,
|
||||||
"enable_plots": enable_plots,
|
"enable_plots": enable_plots,
|
||||||
}
|
}
|
||||||
typer.echo(f"Writing dump to {dump_path}")
|
|
||||||
ExtendedPath(dump_path).write_json(ui_config)
|
ExtendedPath(dump_path).write_json(ui_config)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -150,7 +149,6 @@ def dump_corrections(
|
||||||
|
|
||||||
cursor_obj = col.find({"type": "correction"}, projection={"_id": False})
|
cursor_obj = col.find({"type": "correction"}, projection={"_id": False})
|
||||||
corrections = [c for c in cursor_obj]
|
corrections = [c for c in cursor_obj]
|
||||||
typer.echo(f"Writing dump to {dump_path}")
|
|
||||||
ExtendedPath(dump_path).write_json(corrections)
|
ExtendedPath(dump_path).write_json(corrections)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
3
setup.py
3
setup.py
|
|
@ -63,7 +63,8 @@ setup(
|
||||||
"jasper_transcribe = jasper.transcribe:main",
|
"jasper_transcribe = jasper.transcribe:main",
|
||||||
"jasper_server = jasper.server:main",
|
"jasper_server = jasper.server:main",
|
||||||
"jasper_trainer = jasper.training.cli:main",
|
"jasper_trainer = jasper.training.cli:main",
|
||||||
"jasper_data_generate = jasper.data.tts_generator:main",
|
"jasper_data_tts_generate = jasper.data.tts_generator:main",
|
||||||
|
"jasper_data_conv_generate = jasper.data.conv_generator:main",
|
||||||
"jasper_data_call_recycle = jasper.data.call_recycler:main",
|
"jasper_data_call_recycle = jasper.data.call_recycler:main",
|
||||||
"jasper_data_asr_recycle = jasper.data.asr_recycler:main",
|
"jasper_data_asr_recycle = jasper.data.asr_recycler:main",
|
||||||
"jasper_data_rev_recycle = jasper.data.rev_recycler:main",
|
"jasper_data_rev_recycle = jasper.data.rev_recycler:main",
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue