From fca9c1aeb339699a2d1c6882cdf1ebe885bd6d54 Mon Sep 17 00:00:00 2001 From: Malar Kannan Date: Thu, 21 May 2020 16:47:45 +0530 Subject: [PATCH] refactored module structure --- .gitignore | 4 +- .../validation/jasper_client.py => client.py} | 0 jasper/{data_utils => data}/__init__.py | 0 jasper/{data_utils => data}/asr_recycler.py | 0 jasper/{data_utils => data}/call_recycler.py | 7 +- jasper/{data_utils => data}/process.py | 0 .../data_server.py => data/server.py} | 0 jasper/{data_utils => data}/tts/__init__.py | 0 jasper/{data_utils => data}/tts/googletts.py | 0 jasper/{data_utils => data}/tts/ttsclient.py | 0 .../generator.py => data/tts_generator.py} | 0 jasper/{data_utils => data}/utils.py | 0 .../validation/process.py | 2 +- .../validation/st_rerun.py | 0 jasper/{data_utils => data}/validation/ui.py | 0 jasper/data_utils/parallel.py | 30 -------- jasper/data_utils/validation/orig_ui.py | 73 ------------------- .../{training_utils => training}/__init__.py | 0 .../train.py => training/cli.py} | 0 .../data_loaders.py | 0 .../featurizer.py | 0 setup.py | 14 ++-- streamlit.py | 2 +- 23 files changed, 17 insertions(+), 115 deletions(-) rename jasper/{data_utils/validation/jasper_client.py => client.py} (100%) rename jasper/{data_utils => data}/__init__.py (100%) rename jasper/{data_utils => data}/asr_recycler.py (100%) rename jasper/{data_utils => data}/call_recycler.py (98%) rename jasper/{data_utils => data}/process.py (100%) rename jasper/{data_utils/data_server.py => data/server.py} (100%) rename jasper/{data_utils => data}/tts/__init__.py (100%) rename jasper/{data_utils => data}/tts/googletts.py (100%) rename jasper/{data_utils => data}/tts/ttsclient.py (100%) rename jasper/{data_utils/generator.py => data/tts_generator.py} (100%) rename jasper/{data_utils => data}/utils.py (100%) rename jasper/{data_utils => data}/validation/process.py (99%) rename jasper/{data_utils => data}/validation/st_rerun.py (100%) rename jasper/{data_utils => data}/validation/ui.py (100%) delete mode 100644 jasper/data_utils/parallel.py delete mode 100644 jasper/data_utils/validation/orig_ui.py rename jasper/{training_utils => training}/__init__.py (100%) rename jasper/{training_utils/train.py => training/cli.py} (100%) rename jasper/{training_utils => training}/data_loaders.py (100%) rename jasper/{training_utils => training}/featurizer.py (100%) diff --git a/.gitignore b/.gitignore index f5adf10..8900361 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ -data/ +/data/ +/model/ +/train/ .env* *.yaml diff --git a/jasper/data_utils/validation/jasper_client.py b/jasper/client.py similarity index 100% rename from jasper/data_utils/validation/jasper_client.py rename to jasper/client.py diff --git a/jasper/data_utils/__init__.py b/jasper/data/__init__.py similarity index 100% rename from jasper/data_utils/__init__.py rename to jasper/data/__init__.py diff --git a/jasper/data_utils/asr_recycler.py b/jasper/data/asr_recycler.py similarity index 100% rename from jasper/data_utils/asr_recycler.py rename to jasper/data/asr_recycler.py diff --git a/jasper/data_utils/call_recycler.py b/jasper/data/call_recycler.py similarity index 98% rename from jasper/data_utils/call_recycler.py rename to jasper/data/call_recycler.py index 5989d33..8bf9255 100644 --- a/jasper/data_utils/call_recycler.py +++ b/jasper/data/call_recycler.py @@ -131,10 +131,11 @@ def analyze( call_logs = yaml.load(call_logs_file.read_text()) def get_call_meta(call_obj): - s3_event_url_p = urlsplit(call_obj["DataURI"]) + meta_s3_uri = call_obj["DataURI"] + s3_event_url_p = urlsplit(meta_s3_uri) saved_meta_path = call_meta_dir / Path(Path(s3_event_url_p.path).name) if not saved_meta_path.exists(): - print(f"downloading : {saved_meta_path}") + print(f"downloading : {saved_meta_path} from {meta_s3_uri}") s3.download_file( s3_event_url_p.netloc, s3_event_url_p.path[1:], str(saved_meta_path) ) @@ -206,7 +207,7 @@ def analyze( utter_events = uevs[: ev_count - ev_count % 3] saved_wav_path = call_media_dir / Path(Path(s3_wav_url_p.path).name) if not saved_wav_path.exists(): - print(f"downloading : {saved_wav_path}") + print(f"downloading : {saved_wav_path} from {s3_wav_url}") s3.download_file( s3_wav_url_p.netloc, s3_wav_url_p.path[1:], str(saved_wav_path) ) diff --git a/jasper/data_utils/process.py b/jasper/data/process.py similarity index 100% rename from jasper/data_utils/process.py rename to jasper/data/process.py diff --git a/jasper/data_utils/data_server.py b/jasper/data/server.py similarity index 100% rename from jasper/data_utils/data_server.py rename to jasper/data/server.py diff --git a/jasper/data_utils/tts/__init__.py b/jasper/data/tts/__init__.py similarity index 100% rename from jasper/data_utils/tts/__init__.py rename to jasper/data/tts/__init__.py diff --git a/jasper/data_utils/tts/googletts.py b/jasper/data/tts/googletts.py similarity index 100% rename from jasper/data_utils/tts/googletts.py rename to jasper/data/tts/googletts.py diff --git a/jasper/data_utils/tts/ttsclient.py b/jasper/data/tts/ttsclient.py similarity index 100% rename from jasper/data_utils/tts/ttsclient.py rename to jasper/data/tts/ttsclient.py diff --git a/jasper/data_utils/generator.py b/jasper/data/tts_generator.py similarity index 100% rename from jasper/data_utils/generator.py rename to jasper/data/tts_generator.py diff --git a/jasper/data_utils/utils.py b/jasper/data/utils.py similarity index 100% rename from jasper/data_utils/utils.py rename to jasper/data/utils.py diff --git a/jasper/data_utils/validation/process.py b/jasper/data/validation/process.py similarity index 99% rename from jasper/data_utils/validation/process.py rename to jasper/data/validation/process.py index 2dc3daa..a911684 100644 --- a/jasper/data_utils/validation/process.py +++ b/jasper/data/validation/process.py @@ -22,7 +22,7 @@ def preprocess_datapoint(idx, rel_root, sample, use_domain_asr): import librosa.display from pydub import AudioSegment from nemo.collections.asr.metrics import word_error_rate - from jasper.data_utils.validation.jasper_client import ( + from jasper.client import ( transcriber_pretrained, transcriber_speller, ) diff --git a/jasper/data_utils/validation/st_rerun.py b/jasper/data/validation/st_rerun.py similarity index 100% rename from jasper/data_utils/validation/st_rerun.py rename to jasper/data/validation/st_rerun.py diff --git a/jasper/data_utils/validation/ui.py b/jasper/data/validation/ui.py similarity index 100% rename from jasper/data_utils/validation/ui.py rename to jasper/data/validation/ui.py diff --git a/jasper/data_utils/parallel.py b/jasper/data_utils/parallel.py deleted file mode 100644 index 99141b6..0000000 --- a/jasper/data_utils/parallel.py +++ /dev/null @@ -1,30 +0,0 @@ -import concurrent.futures -import urllib.request - -URLS = [ - "http://www.foxnews.com/", - "http://www.cnn.com/", - "http://europe.wsj.com/", - "http://www.bbc.co.uk/", - "http://some-made-up-domain.com/", -] - - -# Retrieve a single page and report the URL and contents -def load_url(url, timeout): - with urllib.request.urlopen(url, timeout=timeout) as conn: - return conn.read() - - -# We can use a with statement to ensure threads are cleaned up promptly -with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: - # Start the load operations and mark each future with its URL - future_to_url = {executor.submit(load_url, url, 60): url for url in URLS} - for future in concurrent.futures.as_completed(future_to_url): - url = future_to_url[future] - try: - data = future.result() - except Exception as exc: - print("%r generated an exception: %s" % (url, exc)) - else: - print("%r page is %d bytes" % (url, len(data))) diff --git a/jasper/data_utils/validation/orig_ui.py b/jasper/data_utils/validation/orig_ui.py deleted file mode 100644 index 95a23a9..0000000 --- a/jasper/data_utils/validation/orig_ui.py +++ /dev/null @@ -1,73 +0,0 @@ -import json -from pathlib import Path -import streamlit as st - -# import matplotlib.pyplot as plt -# import numpy as np -import librosa -import librosa.display -from pydub import AudioSegment -from jasper.client import transcriber_pretrained, transcriber_speller - -# from pymongo import MongoClient - -st.title("ASR Speller Validation") -dataset_path: Path = Path("/dataset/asr_data/call_alphanum_v3") -manifest_path = dataset_path / Path("test_manifest.json") -# print(manifest_path) -with manifest_path.open("r") as pf: - pnr_jsonl = pf.readlines() - pnr_data = [json.loads(i) for i in pnr_jsonl] - - -def main(): - # pnr_data = MongoClient("mongodb://localhost:27017/").test.asr_pnr - # sample_no = 0 - sample_no = ( - st.slider( - "Sample", - min_value=1, - max_value=len(pnr_data), - value=1, - step=1, - format=None, - key=None, - ) - - 1 - ) - sample = pnr_data[sample_no] - st.write(f"Sample No: {sample_no+1} of {len(pnr_data)}") - audio_path = Path(sample["audio_filepath"]) - # st.write(f"Audio Path:{audio_path}") - aud_seg = AudioSegment.from_wav(audio_path) # .set_channels(1).set_sample_width(2).set_frame_rate(24000) - st.sidebar.text("Transcription") - st.sidebar.text(f"Pretrained:{transcriber_pretrained(aud_seg.raw_data)}") - st.sidebar.text(f"Speller:{transcriber_speller(aud_seg.raw_data)}") - st.sidebar.text(f"Expected: {audio_path.stem}") - spell_text = sample["text"] - st.sidebar.text(f"Spelled: {spell_text}") - st.audio(audio_path.open("rb")) - selected = st.radio("The Audio is", ("Correct", "Incorrect", "Inaudible")) - corrected = audio_path.stem - if selected == "Incorrect": - corrected = st.text_input("Actual:", value=corrected) - # content = '' - if sample_no > 0 and st.button("Previous"): - sample_no -= 1 - if st.button("Next"): - st.write(sample_no, selected, corrected) - sample_no += 1 - - (y, sr) = librosa.load(audio_path) - librosa.display.waveplot(y=y, sr=sr) - # arr = np.random.normal(1, 1, size=100) - # plt.hist(arr, bins=20) - st.sidebar.pyplot() - - -# def main(): -# app() - - -if __name__ == "__main__": - main() diff --git a/jasper/training_utils/__init__.py b/jasper/training/__init__.py similarity index 100% rename from jasper/training_utils/__init__.py rename to jasper/training/__init__.py diff --git a/jasper/training_utils/train.py b/jasper/training/cli.py similarity index 100% rename from jasper/training_utils/train.py rename to jasper/training/cli.py diff --git a/jasper/training_utils/data_loaders.py b/jasper/training/data_loaders.py similarity index 100% rename from jasper/training_utils/data_loaders.py rename to jasper/training/data_loaders.py diff --git a/jasper/training_utils/featurizer.py b/jasper/training/featurizer.py similarity index 100% rename from jasper/training_utils/featurizer.py rename to jasper/training/featurizer.py diff --git a/setup.py b/setup.py index 6b3e97b..3900b78 100644 --- a/setup.py +++ b/setup.py @@ -60,12 +60,14 @@ setup( entry_points={ "console_scripts": [ "jasper_transcribe = jasper.transcribe:main", - "jasper_asr_rpyc_server = jasper.server:main", - "jasper_asr_trainer = jasper.training_utils.train:main", - "jasper_asr_data_generate = jasper.data_utils.generator:main", - "jasper_asr_data_recycle = jasper.data_utils.call_recycler:main", - "jasper_asr_data_validation = jasper.data_utils.validation.process:main", - "jasper_asr_data_preprocess = jasper.data_utils.process:main", + "jasper_server = jasper.server:main", + "jasper_trainer = jasper.training.cli:main", + "jasper_data_generate = jasper.data.tts_generator:main", + "jasper_data_call_recycle = jasper.data.call_recycler:main", + "jasper_data_asr_recycle = jasper.data.asr_recycler:main", + "jasper_data_server = jasper.data.server:main", + "jasper_data_validation = jasper.data.validation.process:main", + "jasper_data_preprocess = jasper.data.process:main", ] }, zip_safe=False, diff --git a/streamlit.py b/streamlit.py index 58ba0bd..b45692e 100644 --- a/streamlit.py +++ b/streamlit.py @@ -1,3 +1,3 @@ import runpy -runpy.run_module("jasper.data_utils.validation.ui", run_name="__main__", alter_sys=True) +runpy.run_module("jasper.data.validation.ui", run_name="__main__", alter_sys=True)