1. added utility command to export call logs
2. mongo conn accepts port
parent
8e79bbb571
commit
2d5b720284
|
|
@ -12,15 +12,52 @@ app = typer.Typer()
|
||||||
|
|
||||||
|
|
||||||
@app.command()
|
@app.command()
|
||||||
def export_logs(call_logs_file: Path = Path("./call_sia_logs.yaml")):
|
def export_all_logs(call_logs_file: Path = Path("./call_sia_logs.yaml")):
|
||||||
from pymongo import MongoClient
|
from .utils import get_mongo_conn
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from ruamel.yaml import YAML
|
from ruamel.yaml import YAML
|
||||||
|
|
||||||
yaml = YAML()
|
yaml = YAML()
|
||||||
mongo_collection = MongoClient("mongodb://localhost:27017/").test.calls
|
mongo_coll = get_mongo_conn().test.calls
|
||||||
caller_calls = defaultdict(lambda: [])
|
caller_calls = defaultdict(lambda: [])
|
||||||
for call in mongo_collection.find():
|
for call in mongo_coll.find():
|
||||||
|
sysid = call["SystemID"]
|
||||||
|
call_uri = f"http://sia-data.agaralabs.com/calls/{sysid}"
|
||||||
|
caller = call["Caller"]
|
||||||
|
caller_calls[caller].append(call_uri)
|
||||||
|
caller_list = []
|
||||||
|
for caller in caller_calls:
|
||||||
|
caller_list.append({"name": caller, "calls": caller_calls[caller]})
|
||||||
|
output_yaml = {"users": caller_list}
|
||||||
|
typer.echo("exporting call logs to yaml file")
|
||||||
|
with call_logs_file.open("w") as yf:
|
||||||
|
yaml.dump(output_yaml, yf)
|
||||||
|
|
||||||
|
|
||||||
|
@app.command()
|
||||||
|
def export_calls_between(
|
||||||
|
start_cid: str,
|
||||||
|
end_cid: str,
|
||||||
|
call_logs_file: Path = Path("./call_sia_logs.yaml"),
|
||||||
|
mongo_port: int = 27017,
|
||||||
|
):
|
||||||
|
from collections import defaultdict
|
||||||
|
from ruamel.yaml import YAML
|
||||||
|
from .utils import get_mongo_conn
|
||||||
|
|
||||||
|
yaml = YAML()
|
||||||
|
mongo_coll = get_mongo_conn(port=mongo_port).test.calls
|
||||||
|
start_meta = mongo_coll.find_one({"SystemID": start_cid})
|
||||||
|
end_meta = mongo_coll.find_one({"SystemID": end_cid})
|
||||||
|
|
||||||
|
caller_calls = defaultdict(lambda: [])
|
||||||
|
call_query = mongo_coll.find(
|
||||||
|
{
|
||||||
|
"StartTS": {"$gte": start_meta["StartTS"]},
|
||||||
|
"EndTS": {"$lte": end_meta["EndTS"]},
|
||||||
|
}
|
||||||
|
)
|
||||||
|
for call in call_query:
|
||||||
sysid = call["SystemID"]
|
sysid = call["SystemID"]
|
||||||
call_uri = f"http://sia-data.agaralabs.com/calls/{sysid}"
|
call_uri = f"http://sia-data.agaralabs.com/calls/{sysid}"
|
||||||
caller = call["Caller"]
|
caller = call["Caller"]
|
||||||
|
|
@ -39,13 +76,14 @@ def analyze(
|
||||||
leaderboard: bool = False,
|
leaderboard: bool = False,
|
||||||
plot_calls: bool = False,
|
plot_calls: bool = False,
|
||||||
extract_data: bool = False,
|
extract_data: bool = False,
|
||||||
|
download_only: bool = False,
|
||||||
call_logs_file: Path = Path("./call_logs.yaml"),
|
call_logs_file: Path = Path("./call_logs.yaml"),
|
||||||
output_dir: Path = Path("./data"),
|
output_dir: Path = Path("./data"),
|
||||||
|
mongo_port: int = 27017,
|
||||||
):
|
):
|
||||||
|
|
||||||
from urllib.parse import urlsplit
|
from urllib.parse import urlsplit
|
||||||
from functools import reduce
|
from functools import reduce
|
||||||
from pymongo import MongoClient
|
|
||||||
import boto3
|
import boto3
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
@ -64,7 +102,7 @@ def analyze(
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import matplotlib
|
import matplotlib
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
from .utils import asr_data_writer
|
from .utils import asr_data_writer, get_mongo_conn
|
||||||
from pydub import AudioSegment
|
from pydub import AudioSegment
|
||||||
from natural.date import compress
|
from natural.date import compress
|
||||||
|
|
||||||
|
|
@ -80,7 +118,7 @@ def analyze(
|
||||||
# logger = logging.getLogger(__name__)
|
# logger = logging.getLogger(__name__)
|
||||||
yaml = YAML()
|
yaml = YAML()
|
||||||
s3 = boto3.client("s3")
|
s3 = boto3.client("s3")
|
||||||
mongo_collection = MongoClient("mongodb://localhost:27017/").test.calls
|
mongo_collection = get_mongo_conn(port=mongo_port).test.calls
|
||||||
call_media_dir: Path = output_dir / Path("call_wavs")
|
call_media_dir: Path = output_dir / Path("call_wavs")
|
||||||
call_media_dir.mkdir(exist_ok=True, parents=True)
|
call_media_dir.mkdir(exist_ok=True, parents=True)
|
||||||
call_meta_dir: Path = output_dir / Path("call_metas")
|
call_meta_dir: Path = output_dir / Path("call_metas")
|
||||||
|
|
@ -118,15 +156,32 @@ def analyze(
|
||||||
|
|
||||||
return get_timedelta
|
return get_timedelta
|
||||||
|
|
||||||
|
def chunk_n(evs, n):
|
||||||
|
return [evs[i * n : (i + 1) * n] for i in range((len(evs) + n - 1) // n)]
|
||||||
|
|
||||||
|
def get_data_points(utter_events, td_fn):
|
||||||
|
data_points = []
|
||||||
|
for evs in chunk_n(utter_events, 3):
|
||||||
|
assert evs[0]["Type"] == "CONV_RESULT"
|
||||||
|
assert evs[1]["Type"] == "STARTED_SPEAKING"
|
||||||
|
assert evs[2]["Type"] == "STOPPED_SPEAKING"
|
||||||
|
start_time = td_fn(evs[1]).total_seconds() - 1.5
|
||||||
|
end_time = td_fn(evs[2]).total_seconds()
|
||||||
|
code = evs[0]["Msg"]
|
||||||
|
data_points.append(
|
||||||
|
{"start_time": start_time, "end_time": end_time, "code": code}
|
||||||
|
)
|
||||||
|
return data_points
|
||||||
|
|
||||||
def process_call(call_obj):
|
def process_call(call_obj):
|
||||||
call_meta = get_call_meta(call_obj)
|
call_meta = get_call_meta(call_obj)
|
||||||
call_events = call_meta["Events"]
|
call_events = call_meta["Events"]
|
||||||
|
|
||||||
def is_writer_event(ev):
|
def is_writer_uri_event(ev):
|
||||||
return ev["Author"] == "AUDIO_WRITER"
|
return ev["Author"] == "AUDIO_WRITER" and 's3://' in ev["Msg"]
|
||||||
|
|
||||||
writer_events = list(filter(is_writer_event, call_events))
|
writer_events = list(filter(is_writer_uri_event, call_events))
|
||||||
s3_wav_url = re.search(r"saved to: (.*)", writer_events[0]["Msg"]).groups(0)[0]
|
s3_wav_url = re.search(r"(s3://.*)", writer_events[0]["Msg"]).groups(0)[0]
|
||||||
s3_wav_url_p = urlsplit(s3_wav_url)
|
s3_wav_url_p = urlsplit(s3_wav_url)
|
||||||
|
|
||||||
def is_first_audio_ev(state, ev):
|
def is_first_audio_ev(state, ev):
|
||||||
|
|
@ -157,22 +212,6 @@ def analyze(
|
||||||
)
|
)
|
||||||
|
|
||||||
# %config InlineBackend.figure_format = "retina"
|
# %config InlineBackend.figure_format = "retina"
|
||||||
def chunk_n(evs, n):
|
|
||||||
return [evs[i * n : (i + 1) * n] for i in range((len(evs) + n - 1) // n)]
|
|
||||||
|
|
||||||
def get_data_points(utter_events):
|
|
||||||
data_points = []
|
|
||||||
for evs in chunk_n(utter_events, 3):
|
|
||||||
assert evs[0]["Type"] == "CONV_RESULT"
|
|
||||||
assert evs[1]["Type"] == "STARTED_SPEAKING"
|
|
||||||
assert evs[2]["Type"] == "STOPPED_SPEAKING"
|
|
||||||
start_time = get_ev_fev_timedelta(evs[1]).total_seconds() - 1.5
|
|
||||||
end_time = get_ev_fev_timedelta(evs[2]).total_seconds()
|
|
||||||
code = evs[0]["Msg"]
|
|
||||||
data_points.append(
|
|
||||||
{"start_time": start_time, "end_time": end_time, "code": code}
|
|
||||||
)
|
|
||||||
return data_points
|
|
||||||
|
|
||||||
def plot_events(y, sr, utter_events, file_path):
|
def plot_events(y, sr, utter_events, file_path):
|
||||||
plt.figure(figsize=(16, 12))
|
plt.figure(figsize=(16, 12))
|
||||||
|
|
@ -202,22 +241,36 @@ def analyze(
|
||||||
plt.title("Monophonic")
|
plt.title("Monophonic")
|
||||||
plt.savefig(file_path, format="png")
|
plt.savefig(file_path, format="png")
|
||||||
|
|
||||||
data_points = get_data_points(utter_events)
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"wav_path": saved_wav_path,
|
"wav_path": saved_wav_path,
|
||||||
"num_samples": len(utter_events) // 3,
|
"num_samples": len(utter_events) // 3,
|
||||||
"meta": call_obj,
|
"meta": call_obj,
|
||||||
"data_points": data_points,
|
"first_event_fn": get_ev_fev_timedelta,
|
||||||
|
"utter_events": utter_events,
|
||||||
}
|
}
|
||||||
|
|
||||||
def retrieve_callmeta(uri):
|
def get_cid(uri):
|
||||||
cid = Path(urlsplit(uri).path).stem
|
return Path(urlsplit(uri).path).stem
|
||||||
|
|
||||||
|
def ensure_call(uri):
|
||||||
|
cid = get_cid(uri)
|
||||||
|
meta = mongo_collection.find_one({"SystemID": cid})
|
||||||
|
process_meta = process_call(meta)
|
||||||
|
return process_meta
|
||||||
|
|
||||||
|
def retrieve_processed_callmeta(uri):
|
||||||
|
cid = get_cid(uri)
|
||||||
meta = mongo_collection.find_one({"SystemID": cid})
|
meta = mongo_collection.find_one({"SystemID": cid})
|
||||||
duration = meta["EndTS"] - meta["StartTS"]
|
duration = meta["EndTS"] - meta["StartTS"]
|
||||||
process_meta = process_call(meta)
|
process_meta = process_call(meta)
|
||||||
|
data_points = get_data_points(process_meta['utter_events'], process_meta['first_event_fn'])
|
||||||
|
process_meta['data_points'] = data_points
|
||||||
return {"url": uri, "meta": meta, "duration": duration, "process": process_meta}
|
return {"url": uri, "meta": meta, "duration": duration, "process": process_meta}
|
||||||
|
|
||||||
|
def download_meta_audio():
|
||||||
|
call_lens = lens["users"].Each()["calls"].Each()
|
||||||
|
call_lens.modify(ensure_call)(call_logs)
|
||||||
|
|
||||||
# @plot_app.command()
|
# @plot_app.command()
|
||||||
def plot_calls_data():
|
def plot_calls_data():
|
||||||
def plot_data_points(y, sr, data_points, file_path):
|
def plot_data_points(y, sr, data_points, file_path):
|
||||||
|
|
@ -252,9 +305,8 @@ def analyze(
|
||||||
plot_data_points(y, sr, data_points, str(file_path))
|
plot_data_points(y, sr, data_points, str(file_path))
|
||||||
return file_path
|
return file_path
|
||||||
|
|
||||||
# plot_call(retrieve_callmeta("http://saasdev.agaralabs.com/calls/JOR9V47L03AGUEL"))
|
|
||||||
call_lens = lens["users"].Each()["calls"].Each()
|
call_lens = lens["users"].Each()["calls"].Each()
|
||||||
call_stats = call_lens.modify(retrieve_callmeta)(call_logs)
|
call_stats = call_lens.modify(retrieve_processed_callmeta)(call_logs)
|
||||||
# call_plot_data = call_lens.collect()(call_stats)
|
# call_plot_data = call_lens.collect()(call_stats)
|
||||||
call_plots = call_lens.modify(plot_call)(call_stats)
|
call_plots = call_lens.modify(plot_call)(call_stats)
|
||||||
# with ThreadPoolExecutor(max_workers=20) as exe:
|
# with ThreadPoolExecutor(max_workers=20) as exe:
|
||||||
|
|
@ -285,7 +337,7 @@ def analyze(
|
||||||
yield extracted_code, code_seg.duration_seconds, code_wav
|
yield extracted_code, code_seg.duration_seconds, code_wav
|
||||||
|
|
||||||
call_lens = lens["users"].Each()["calls"].Each()
|
call_lens = lens["users"].Each()["calls"].Each()
|
||||||
call_stats = call_lens.modify(retrieve_callmeta)(call_logs)
|
call_stats = call_lens.modify(retrieve_processed_callmeta)(call_logs)
|
||||||
call_objs = call_lens.collect()(call_stats)
|
call_objs = call_lens.collect()(call_stats)
|
||||||
|
|
||||||
def data_source():
|
def data_source():
|
||||||
|
|
@ -315,7 +367,7 @@ def analyze(
|
||||||
}
|
}
|
||||||
|
|
||||||
call_lens = lens["users"].Each()["calls"].Each()
|
call_lens = lens["users"].Each()["calls"].Each()
|
||||||
call_stats = call_lens.modify(retrieve_callmeta)(call_logs)
|
call_stats = call_lens.modify(retrieve_processed_callmeta)(call_logs)
|
||||||
user_stats = lens["users"].Each().modify(compute_user_stats)(call_stats)
|
user_stats = lens["users"].Each().modify(compute_user_stats)(call_stats)
|
||||||
leader_df = (
|
leader_df = (
|
||||||
pd.DataFrame(user_stats["users"])
|
pd.DataFrame(user_stats["users"])
|
||||||
|
|
@ -338,6 +390,9 @@ def analyze(
|
||||||
)
|
)
|
||||||
print(leader_board.to_string(index=False))
|
print(leader_board.to_string(index=False))
|
||||||
|
|
||||||
|
if download_only:
|
||||||
|
download_meta_audio()
|
||||||
|
return
|
||||||
if leaderboard:
|
if leaderboard:
|
||||||
show_leaderboard()
|
show_leaderboard()
|
||||||
if plot_calls:
|
if plot_calls:
|
||||||
|
|
|
||||||
|
|
@ -104,9 +104,9 @@ class ExtendedPath(type(Path())):
|
||||||
return json.dump(data, jf, indent=2)
|
return json.dump(data, jf, indent=2)
|
||||||
|
|
||||||
|
|
||||||
def get_mongo_conn(host=''):
|
def get_mongo_conn(host='', port=27017):
|
||||||
mongo_host = host if host else os.environ.get("MONGO_HOST", "localhost")
|
mongo_host = host if host else os.environ.get("MONGO_HOST", "localhost")
|
||||||
mongo_uri = f"mongodb://{mongo_host}:27017/"
|
mongo_uri = f"mongodb://{mongo_host}:{port}/"
|
||||||
return pymongo.MongoClient(mongo_uri)
|
return pymongo.MongoClient(mongo_uri)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue