refactored module structure

parent 2d5b720284
commit fca9c1aeb3
@@ -1,4 +1,6 @@
-data/
+/data/
+/model/
+/train/
 .env*
 *.yaml
 
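(Assuming this is the repository's top-level ignore file: the added leading slash anchors each pattern to the repo root, so only the top-level data/, model/ and train/ directories are ignored, not same-named directories nested deeper.)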
@@ -131,10 +131,11 @@ def analyze(
     call_logs = yaml.load(call_logs_file.read_text())
 
     def get_call_meta(call_obj):
-        s3_event_url_p = urlsplit(call_obj["DataURI"])
+        meta_s3_uri = call_obj["DataURI"]
+        s3_event_url_p = urlsplit(meta_s3_uri)
         saved_meta_path = call_meta_dir / Path(Path(s3_event_url_p.path).name)
         if not saved_meta_path.exists():
-            print(f"downloading : {saved_meta_path}")
+            print(f"downloading : {saved_meta_path} from {meta_s3_uri}")
             s3.download_file(
                 s3_event_url_p.netloc, s3_event_url_p.path[1:], str(saved_meta_path)
             )
@@ -206,7 +207,7 @@ def analyze(
         utter_events = uevs[: ev_count - ev_count % 3]
         saved_wav_path = call_media_dir / Path(Path(s3_wav_url_p.path).name)
         if not saved_wav_path.exists():
-            print(f"downloading : {saved_wav_path}")
+            print(f"downloading : {saved_wav_path} from {s3_wav_url}")
             s3.download_file(
                 s3_wav_url_p.netloc, s3_wav_url_p.path[1:], str(saved_wav_path)
             )
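Both download sites now log the source URI alongside the local target path. For context, urlsplit is what turns the stored s3:// URI into the bucket/key pair that s3.download_file expects; a minimal sketch of that decomposition, with a made-up URI:

from urllib.parse import urlsplit

# Made-up URI, purely to illustrate the decomposition used above.
parts = urlsplit("s3://call-bucket/meta/call_0001.json")
print(parts.netloc)    # "call-bucket"          -> bucket argument
print(parts.path[1:])  # "meta/call_0001.json"  -> key (leading "/" stripped)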
@@ -22,7 +22,7 @@ def preprocess_datapoint(idx, rel_root, sample, use_domain_asr):
     import librosa.display
     from pydub import AudioSegment
     from nemo.collections.asr.metrics import word_error_rate
-    from jasper.data_utils.validation.jasper_client import (
+    from jasper.client import (
         transcriber_pretrained,
         transcriber_speller,
     )
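After this move the transcriber callables live in the top-level jasper.client module rather than under data_utils.validation. A minimal usage sketch, mirroring how the deleted validation UI further down calls them (the wav path is a placeholder):

from pydub import AudioSegment
from jasper.client import transcriber_pretrained, transcriber_speller

# "sample.wav" is illustrative; both transcribers take raw PCM bytes,
# as in the removed validation UI below.
seg = AudioSegment.from_wav("sample.wav")
print(transcriber_pretrained(seg.raw_data))
print(transcriber_speller(seg.raw_data))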
@ -1,30 +0,0 @@
|
||||||
import concurrent.futures
|
|
||||||
import urllib.request
|
|
||||||
|
|
||||||
URLS = [
|
|
||||||
"http://www.foxnews.com/",
|
|
||||||
"http://www.cnn.com/",
|
|
||||||
"http://europe.wsj.com/",
|
|
||||||
"http://www.bbc.co.uk/",
|
|
||||||
"http://some-made-up-domain.com/",
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
# Retrieve a single page and report the URL and contents
|
|
||||||
def load_url(url, timeout):
|
|
||||||
with urllib.request.urlopen(url, timeout=timeout) as conn:
|
|
||||||
return conn.read()
|
|
||||||
|
|
||||||
|
|
||||||
# We can use a with statement to ensure threads are cleaned up promptly
|
|
||||||
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
|
|
||||||
# Start the load operations and mark each future with its URL
|
|
||||||
future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
|
|
||||||
for future in concurrent.futures.as_completed(future_to_url):
|
|
||||||
url = future_to_url[future]
|
|
||||||
try:
|
|
||||||
data = future.result()
|
|
||||||
except Exception as exc:
|
|
||||||
print("%r generated an exception: %s" % (url, exc))
|
|
||||||
else:
|
|
||||||
print("%r page is %d bytes" % (url, len(data)))
|
|
||||||
|
|
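The file removed above is the stock ThreadPoolExecutor example from the Python concurrent.futures documentation, kept verbatim (including the deliberately unreachable some-made-up-domain.com URL), so nothing project-specific is lost by deleting it.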
@@ -1,73 +0,0 @@
-import json
-from pathlib import Path
-import streamlit as st
-
-# import matplotlib.pyplot as plt
-# import numpy as np
-import librosa
-import librosa.display
-from pydub import AudioSegment
-from jasper.client import transcriber_pretrained, transcriber_speller
-
-# from pymongo import MongoClient
-
-st.title("ASR Speller Validation")
-dataset_path: Path = Path("/dataset/asr_data/call_alphanum_v3")
-manifest_path = dataset_path / Path("test_manifest.json")
-# print(manifest_path)
-with manifest_path.open("r") as pf:
-    pnr_jsonl = pf.readlines()
-pnr_data = [json.loads(i) for i in pnr_jsonl]
-
-
-def main():
-    # pnr_data = MongoClient("mongodb://localhost:27017/").test.asr_pnr
-    # sample_no = 0
-    sample_no = (
-        st.slider(
-            "Sample",
-            min_value=1,
-            max_value=len(pnr_data),
-            value=1,
-            step=1,
-            format=None,
-            key=None,
-        )
-        - 1
-    )
-    sample = pnr_data[sample_no]
-    st.write(f"Sample No: {sample_no+1} of {len(pnr_data)}")
-    audio_path = Path(sample["audio_filepath"])
-    # st.write(f"Audio Path:{audio_path}")
-    aud_seg = AudioSegment.from_wav(audio_path)  # .set_channels(1).set_sample_width(2).set_frame_rate(24000)
-    st.sidebar.text("Transcription")
-    st.sidebar.text(f"Pretrained:{transcriber_pretrained(aud_seg.raw_data)}")
-    st.sidebar.text(f"Speller:{transcriber_speller(aud_seg.raw_data)}")
-    st.sidebar.text(f"Expected: {audio_path.stem}")
-    spell_text = sample["text"]
-    st.sidebar.text(f"Spelled: {spell_text}")
-    st.audio(audio_path.open("rb"))
-    selected = st.radio("The Audio is", ("Correct", "Incorrect", "Inaudible"))
-    corrected = audio_path.stem
-    if selected == "Incorrect":
-        corrected = st.text_input("Actual:", value=corrected)
-    # content = ''
-    if sample_no > 0 and st.button("Previous"):
-        sample_no -= 1
-    if st.button("Next"):
-        st.write(sample_no, selected, corrected)
-        sample_no += 1
-
-    (y, sr) = librosa.load(audio_path)
-    librosa.display.waveplot(y=y, sr=sr)
-    # arr = np.random.normal(1, 1, size=100)
-    # plt.hist(arr, bins=20)
-    st.sidebar.pyplot()
-
-
-# def main():
-#     app()
-
-
-if __name__ == "__main__":
-    main()
setup.py (14 changed lines)
@@ -60,12 +60,14 @@ setup(
     entry_points={
         "console_scripts": [
             "jasper_transcribe = jasper.transcribe:main",
-            "jasper_asr_rpyc_server = jasper.server:main",
-            "jasper_asr_trainer = jasper.training_utils.train:main",
-            "jasper_asr_data_generate = jasper.data_utils.generator:main",
-            "jasper_asr_data_recycle = jasper.data_utils.call_recycler:main",
-            "jasper_asr_data_validation = jasper.data_utils.validation.process:main",
-            "jasper_asr_data_preprocess = jasper.data_utils.process:main",
+            "jasper_server = jasper.server:main",
+            "jasper_trainer = jasper.training.cli:main",
+            "jasper_data_generate = jasper.data.tts_generator:main",
+            "jasper_data_call_recycle = jasper.data.call_recycler:main",
+            "jasper_data_asr_recycle = jasper.data.asr_recycler:main",
+            "jasper_data_server = jasper.data.server:main",
+            "jasper_data_validation = jasper.data.validation.process:main",
+            "jasper_data_preprocess = jasper.data.process:main",
         ]
     },
     zip_safe=False,
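For context, each console_scripts entry maps an installed command name to a module:function target, so reinstalling the package regenerates the commands under their new names. Roughly what the launcher generated for jasper_data_preprocess does after install (a sketch of setuptools' behavior, not project code):

import sys

from jasper.data.process import main

# setuptools writes an equivalent wrapper script on install; the
# command exits with whatever main() returns.
if __name__ == "__main__":
    sys.exit(main())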
@@ -1,3 +1,3 @@
 import runpy
 
-runpy.run_module("jasper.data_utils.validation.ui", run_name="__main__", alter_sys=True)
+runpy.run_module("jasper.data.validation.ui", run_name="__main__", alter_sys=True)
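Calling runpy.run_module with run_name="__main__" and alter_sys=True emulates python -m jasper.data.validation.ui; a thin wrapper script like this is a common way to hand a module-based entry point to tools such as streamlit run, which expect a plain file path.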