1. Self-contained typers

2. ASR force-aligner visualization
3. streamlit state management abstraction
4. new utils / reorganize
5. added verbose flags
6. add tts by name
tegra
Malar 2021-03-23 13:27:35 +05:30
parent f72c6bbe5b
commit c474aa5f5a
22 changed files with 1097 additions and 146 deletions

1
MANIFEST.in Normal file
View File

@ -0,0 +1 @@
graft plume/utils/gentle_preview

View File

@ -7,12 +7,12 @@ from .eval import app as eval_app
from .serve import app as serve_app
app = typer.Typer()
app.add_typer(data_app, name="data")
app.add_typer(ui_app, name="ui")
app.add_typer(train_app, name="train")
app.add_typer(eval_app, name="eval")
app.add_typer(serve_app, name="serve")
app.add_typer(utils_app, name='utils')
app.add_typer(data_app)
app.add_typer(ui_app)
app.add_typer(train_app)
app.add_typer(eval_app)
app.add_typer(serve_app)
app.add_typer(utils_app)
def main():

View File

@ -27,6 +27,13 @@ app.add_typer(generate_app, name="generate")
app.add_typer(wav2vec2_app, name="wav2vec2")
@app.callback()
def data():
"""
data sub commands
"""
@app.command()
def fix_path(dataset_path: Path, force: bool = False):
manifest_path = dataset_path / Path("manifest.json")

View File

@ -3,3 +3,10 @@ from ..models.wav2vec2.eval import app as wav2vec2_app
app = typer.Typer()
app.add_typer(wav2vec2_app, name="wav2vec2")
@app.callback()
def eval():
"""
eval sub commands
"""

View File

@ -5,3 +5,10 @@ from ..models.jasper.serve import app as jasper_app
app = typer.Typer()
app.add_typer(wav2vec2_app, name="wav2vec2")
app.add_typer(jasper_app, name="jasper")
@app.callback()
def serve():
"""
serve sub commands
"""

View File

@ -1,5 +1,12 @@
import typer
from ..models.wav2vec2.train import app as train_app
from ..models.wav2vec2.train import app as wav2vec2_app
app = typer.Typer()
app.add_typer(train_app, name="wav2vec2")
app.add_typer(wav2vec2_app, name="wav2vec2")
@app.callback()
def train():
"""
train sub commands
"""

View File

@ -3,12 +3,20 @@ import sys
from pathlib import Path
from plume.utils import lazy_module
# from streamlit import cli as stcli
stcli = lazy_module('streamlit.cli')
stcli = lazy_module("streamlit.cli")
app = typer.Typer()
@app.callback()
def ui():
"""
ui sub commands
"""
@app.command()
def annotation(data_dir: Path, dump_fname: Path = "ui_dump.json", task_id: str = ""):
annotation_lit_path = Path(__file__).parent / Path("annotation.py")
@ -40,13 +48,7 @@ def annotation(data_dir: Path, dump_fname: Path = "ui_dump.json", task_id: str =
@app.command()
def preview(manifest_path: Path):
annotation_lit_path = Path(__file__).parent / Path("preview.py")
sys.argv = [
"streamlit",
"run",
str(annotation_lit_path),
"--",
str(manifest_path)
]
sys.argv = ["streamlit", "run", str(annotation_lit_path), "--", str(manifest_path)]
sys.exit(stcli.main())
@ -56,6 +58,18 @@ def collection(data_dir: Path, task_id: str = ""):
pass
@app.command()
def alignment(preview_dir: Path, port: int = 8010):
    """Serve ``preview_dir`` over HTTP with byte-range support.

    Range requests let the browser seek inside the preview audio
    (presumably for the gentle_preview alignment page — confirm).
    Blocks forever; stop with Ctrl-C.
    """
    # Local imports: RangeHTTPServer is only needed for this command.
    from RangeHTTPServer import RangeRequestHandler
    from functools import partial
    from http.server import HTTPServer

    server_address = ("", port)  # bind all interfaces on `port`
    handler_class = partial(RangeRequestHandler, directory=str(preview_dir))
    httpd = HTTPServer(server_address, handler_class)
    httpd.serve_forever()
def main():
app()

View File

@ -1,66 +1,14 @@
# import sys
from pathlib import Path
from uuid import uuid4
import streamlit as st
import typer
from plume.utils import ExtendedPath, get_mongo_conn
from plume.preview.st_rerun import rerun
from plume.utils import ExtendedPath
from plume.utils.ui_persist import setup_mongo_asr_validation_state
app = typer.Typer()
if not hasattr(st, "mongo_connected"):
st.mongoclient = get_mongo_conn(col="asr_validation")
mongo_conn = st.mongoclient
st.task_id = str(uuid4())
def current_cursor_fn():
# mongo_conn = st.mongoclient
cursor_obj = mongo_conn.find_one(
{"type": "current_cursor", "task_id": st.task_id}
)
cursor_val = cursor_obj["cursor"]
return cursor_val
def update_cursor_fn(val=0):
mongo_conn.find_one_and_update(
{"type": "current_cursor", "task_id": st.task_id},
{"$set": {"type": "current_cursor", "task_id": st.task_id, "cursor": val}},
upsert=True,
)
rerun()
def get_correction_entry_fn(code):
return mongo_conn.find_one(
{"type": "correction", "code": code}, projection={"_id": False}
)
def update_entry_fn(code, value):
mongo_conn.find_one_and_update(
{"type": "correction", "code": code},
{"$set": {"value": value, "task_id": st.task_id}},
upsert=True,
)
def set_task_fn(data_path, task_id):
if task_id:
st.task_id = task_id
task_path = data_path / Path(f"task-{st.task_id}.lck")
if not task_path.exists():
print(f"creating task lock at {task_path}")
task_path.touch()
st.get_current_cursor = current_cursor_fn
st.update_cursor = update_cursor_fn
st.get_correction_entry = get_correction_entry_fn
st.update_entry = update_entry_fn
st.set_task = set_task_fn
st.mongo_connected = True
cursor_obj = mongo_conn.find_one({"type": "current_cursor", "task_id": st.task_id})
if not cursor_obj:
update_cursor_fn(0)
setup_mongo_asr_validation_state(st)
@st.cache()

View File

@ -3,27 +3,11 @@ from pathlib import Path
import streamlit as st
import typer
from plume.utils import ExtendedPath
from plume.preview.st_rerun import rerun
from plume.utils.ui_persist import setup_file_state
app = typer.Typer()
if not hasattr(st, "state_lock"):
# st.task_id = str(uuid4())
task_path = ExtendedPath("preview.lck")
def current_cursor_fn():
return task_path.read_json()["current_cursor"]
def update_cursor_fn(val=0):
task_path.write_json({"current_cursor": val})
rerun()
st.get_current_cursor = current_cursor_fn
st.update_cursor = update_cursor_fn
st.state_lock = True
# cursor_obj = mongo_conn.find_one({"type": "current_cursor", "task_id": st.task_id})
# if not cursor_obj:
update_cursor_fn(0)
setup_file_state(st)
@st.cache()
@ -40,7 +24,7 @@ def main(manifest: Path):
print("Invalid samplno resetting to 0")
st.update_cursor(0)
sample = asr_data[sample_no]
st.title(f"ASR Manifest Preview")
st.title("ASR Manifest Preview")
st.markdown(f"{sample_no+1} of {len(asr_data)} : **{sample['text']}**")
new_sample = st.number_input(
"Go To Sample:", value=sample_no + 1, min_value=1, max_value=len(asr_data)

151
plume/utils/.gitignore vendored Normal file
View File

@ -0,0 +1,151 @@
/data/
/model/
/train/
.env*
*.yaml
*.yml
*.json
# Created by https://www.gitignore.io/api/python
# Edit at https://www.gitignore.io/?templates=python
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# Mr Developer
.mr.developer.cfg
.project
.pydevproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# End of https://www.gitignore.io/api/python
# Created by https://www.gitignore.io/api/macos
# Edit at https://www.gitignore.io/?templates=macos
### macOS ###
# General
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
# End of https://www.gitignore.io/api/macos

View File

@ -11,12 +11,14 @@ from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import subprocess
import shutil
from urllib.parse import urlsplit
# from .lazy_loader import LazyLoader
from .lazy_import import lazy_callable, lazy_module
# from ruamel.yaml import YAML
# import boto3
import typer
# import pymongo
# from slugify import slugify
# import pydub
@ -34,16 +36,16 @@ from .tts import app as tts_app
from .transcribe import app as transcribe_app
from .align import app as align_app
boto3 = lazy_module('boto3')
pymongo = lazy_module('pymongo')
pydub = lazy_module('pydub')
audio_display = lazy_module('librosa.display')
plt = lazy_module('matplotlib.pyplot')
librosa = lazy_module('librosa')
YAML = lazy_callable('ruamel.yaml.YAML')
num2words = lazy_callable('num2words.num2words')
slugify = lazy_callable('slugify.slugify')
compress = lazy_callable('natural.date.compress')
boto3 = lazy_module("boto3")
pymongo = lazy_module("pymongo")
pydub = lazy_module("pydub")
audio_display = lazy_module("librosa.display")
plt = lazy_module("matplotlib.pyplot")
librosa = lazy_module("librosa")
YAML = lazy_callable("ruamel.yaml.YAML")
num2words = lazy_callable("num2words.num2words")
slugify = lazy_callable("slugify.slugify")
compress = lazy_callable("natural.date.compress")
app = typer.Typer()
app.add_typer(tts_app, name="tts")
@ -51,6 +53,13 @@ app.add_typer(align_app, name="align")
app.add_typer(transcribe_app, name="transcribe")
@app.callback()
def utils():
"""
utils sub commands
"""
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
@ -125,6 +134,10 @@ def upload_s3(dataset_path, s3_path):
run_shell(f"aws s3 sync {dataset_path} {s3_path}")
def copy_s3(dataset_path, s3_path):
run_shell(f"aws s3 cp {dataset_path} {s3_path}")
def get_download_path(s3_uri, output_path):
s3_uri_p = urlsplit(s3_uri)
download_path = output_path / Path(s3_uri_p.path[1:])
@ -135,11 +148,12 @@ def get_download_path(s3_uri, output_path):
def s3_downloader():
s3 = boto3.client("s3")
def download_s3(s3_uri, download_path):
def download_s3(s3_uri, download_path, verbose=False):
s3_uri_p = urlsplit(s3_uri)
download_path.parent.mkdir(exist_ok=True, parents=True)
if not download_path.exists():
print(f"downloading {s3_uri} to {download_path}")
if verbose:
print(f"downloading {s3_uri} to {download_path}")
s3.download_file(s3_uri_p.netloc, s3_uri_p.path[1:], str(download_path))
return download_s3
@ -186,6 +200,7 @@ def ui_data_generator(dataset_dir, asr_data_source, verbose=False):
plot_seg(wav_plot_path.absolute(), audio_file)
return {
"audio_path": str(rel_data_path),
"audio_filepath": str(rel_data_path),
"duration": round(audio_dur, 1),
"text": transcript,
"real_idx": num_datapoints,
@ -229,17 +244,17 @@ def ui_dump_manifest_writer(dataset_dir, asr_data_source, verbose=False):
)
asr_manifest = dataset_dir / Path("manifest.json")
with asr_manifest.open("w") as mf:
print(f"writing manifest to {asr_manifest}")
for d in dump_data:
rel_data_path = d["audio_path"]
audio_dur = d["duration"]
transcript = d["text"]
manifest = manifest_str(str(rel_data_path), audio_dur, transcript)
mf.write(manifest)
asr_manifest_writer(asr_manifest, dump_data, verbose=verbose)
# with asr_manifest.open("w") as mf:
# print(f"writing manifest to {asr_manifest}")
# for d in dump_data:
# rel_data_path = d["audio_path"]
# audio_dur = d["duration"]
# transcript = d["text"]
# manifest = manifest_str(str(rel_data_path), audio_dur, transcript)
# mf.write(manifest)
ui_dump_file = dataset_dir / Path("ui_dump.json")
ExtendedPath(ui_dump_file).write_json({"data": dump_data})
ExtendedPath(ui_dump_file).write_json({"data": dump_data}, verbose=verbose)
return num_datapoints
@ -254,9 +269,10 @@ def asr_manifest_reader(data_manifest_path: Path):
yield p
def asr_manifest_writer(asr_manifest_path: Path, manifest_str_source):
def asr_manifest_writer(asr_manifest_path: Path, manifest_str_source, verbose=False):
with asr_manifest_path.open("w") as mf:
print(f"opening {asr_manifest_path} for writing manifest")
if verbose:
print(f"writing asr manifest to {asr_manifest_path}")
for mani_dict in manifest_str_source:
manifest = manifest_str(
mani_dict["audio_filepath"], mani_dict["duration"], mani_dict["text"]
@ -293,37 +309,43 @@ def batch(iterable, n=1):
class ExtendedPath(type(Path())):
"""docstring for ExtendedPath."""
def read_json(self):
print(f"reading json from {self}")
def read_json(self, verbose=False):
if verbose:
print(f"reading json from {self}")
with self.open("r") as jf:
return json.load(jf)
def read_yaml(self):
def read_yaml(self, verbose=False):
yaml = YAML(typ="safe", pure=True)
print(f"reading yaml from {self}")
if verbose:
print(f"reading yaml from {self}")
with self.open("r") as yf:
return yaml.load(yf)
def read_jsonl(self):
print(f"reading jsonl from {self}")
def read_jsonl(self, verbose=False):
if verbose:
print(f"reading jsonl from {self}")
with self.open("r") as jf:
for l in jf.readlines():
yield json.loads(l)
for ln in jf.readlines():
yield json.loads(ln)
def write_json(self, data):
print(f"writing json to {self}")
def write_json(self, data, verbose=False):
if verbose:
print(f"writing json to {self}")
self.parent.mkdir(parents=True, exist_ok=True)
with self.open("w") as jf:
json.dump(data, jf, indent=2)
def write_yaml(self, data):
def write_yaml(self, data, verbose=False):
yaml = YAML()
print(f"writing yaml to {self}")
if verbose:
print(f"writing yaml to {self}")
with self.open("w") as yf:
yaml.dump(data, yf)
def write_jsonl(self, data):
print(f"writing jsonl to {self}")
def write_jsonl(self, data, verbose=False):
if verbose:
print(f"writing jsonl to {self}")
self.parent.mkdir(parents=True, exist_ok=True)
with self.open("w") as jf:
for d in data:

View File

@ -1,12 +1,14 @@
from pathlib import Path
from .tts import GoogleTTS
# from IPython import display
import requests
import io
import typer
import shutil
import typer
from plume.utils import lazy_module
from .tts import GoogleTTS
display = lazy_module('IPython.display')
pydub = lazy_module('pydub')
@ -63,16 +65,19 @@ def gentle_preview(
audio_path: Path,
transcript_path: Path,
service_uri="http://101.53.142.218:8765/transcriptions",
gent_preview_dir="../gentle_preview",
gent_preview_dir="./gentle_preview",
):
from . import ExtendedPath
ab = audio_path.read_bytes()
tt = transcript_path.read_text()
audio, alignment = gentle_aligner(service_uri, ab, tt)
audio.export(gent_preview_dir / Path("a.wav"), format="wav")
alignment["status"] = "OK"
ExtendedPath(gent_preview_dir / Path("status.json")).write_json(alignment)
pkg_gentle_dir = Path(__file__).parent / 'gentle_preview'
shutil.copytree(str(pkg_gentle_dir), str(gent_preview_dir))
# ab = audio_path.read_bytes()
# tt = transcript_path.read_text()
# audio, alignment = gentle_aligner(service_uri, ab, tt)
# audio.export(gent_preview_dir / Path("a.wav"), format="wav")
# alignment["status"] = "OK"
# ExtendedPath(gent_preview_dir / Path("status.json")).write_json(alignment)
def main():

View File

@ -0,0 +1,5 @@
Serve with https://github.com/danvk/RangeHTTPServer
`https://github.com/claysciences/CORSRangeHTTPServer`
`python -m RangeHTTPServer`
`python -m http.server`

View File

@ -0,0 +1,80 @@
<html>
<head>
<meta charset="utf-8" />
<style>
body {font-family: sans-serif; padding-top: 70px; }
textarea { width: 500px; height: 20em; }
input, textarea { margin: 1em 0; }
#header {
position: fixed;
top: 0;
left: 0;
height: 50px;
line-height: 50px;
width: 100%;
background-color: #999;
box-shadow: 0px 0px 5px 0px rgba(0,0,0,0.5);
font-family: Helvetica, sans-serif;
}
#header, #header a {
color: white;
}
.home {
margin: 0;
font-size: 125%;
font-weight: lighter;
text-transform: lowercase;
}
.home a {
margin: 0;
background: #666;
padding-left: 25px;
padding-right: 30px;
margin-right: 20px;
float: left;
text-decoration: none;
}
.home:hover a {
background: #555;
}
#align-button {
background: #CCC;
border: 0;
font-size: 18px;
padding: 10px 30px;
cursor: pointer;
}
#alignment-flags {
background: #CCC;
border: 0;
font-size: 18px;
padding: 10px 30px;
}
#footer {
margin-top: 100px;
border-top: 1px dotted black;
font-size: 8pt;
font-style: italic;
padding: 10px;
}
</style>
</head>
<body>
<div id="header">
<h1 class="home"><a href="/">Gentle</a></h1>
</div>
<form action="/transcriptions" method="POST" enctype="multipart/form-data">
Audio:<br>
<input type=file name=audio><br>
<br>
Transcript:<br>
<textarea name="transcript"></textarea><br>
<input id=alignment-flags name=conservative type=checkbox> Conservative<br>
<input id=alignment-flags name=disfluency type=checkbox> Include disfluencies<br>
<input id="align-button" type=submit value=Align>
</form>
<div id="footer">
<a href="https://lowerquality.com/gentle">Gentle</a> is free software released under the <a href="https://opensource.org/licenses/MIT">MIT license</a>. <a href="https://lowerquality.com/gentle">Homepage</a> | <a href="https://github.com/lowerquality/gentle">Source code</a>.
</div>
</body>
</html>

View File

@ -0,0 +1,408 @@
<html>
<head>
<meta charset="utf-8" />
<style>
html, body {
margin: 0;
padding: 0;
}
#header {
position: fixed;
top: 0;
left: 0;
height: 50px;
line-height: 50px;
width: 100%;
background-color: #999;
box-shadow: 0px 0px 5px 0px rgba(0,0,0,0.5);
font-family: Helvetica, sans-serif;
}
#header, #header a {
color: white;
}
#downloads {
float: right;
background: #999;
}
.download {
float: right;
background: #999;
padding: 0 5px;
}
.home {
margin: 0;
font-size: 125%;
font-weight: lighter;
text-transform: lowercase;
}
.home a {
margin: 0;
background: #666;
padding-left: 25px;
padding-right: 30px;
margin-right: 20px;
float: left;
text-decoration: none;
}
.home:hover a {
background: #555;
}
#audio {
margin-top: 9px;
width: 50%;
display: inline-block;
}
#transcript {
margin: 0 15px;
margin-top: 70px;
margin-bottom: 5em;
white-space: pre-wrap;
line-height: 2em;
max-width: 600px;
color: #999;
}
#transcript.status {
background-color: #333;
color: #fff;
font-family: Courier, mono;
line-height: 1em;
font-size: 10pt;
max-width: 100%;
}
#transcript.status h2 {
padding: 10px;
}
#transcript.status .entry {
margin-bottom: 10px;
padding: 10px;
}
#transcript.status progress {
width: 100%;
height: 30px;
margin-bottom: 20px;
}
.success {
color: black;
}
.success:hover {
text-decoration: underline;
}
.active {
color: magenta;
}
#preloader {
visibility: hidden;
}
.phactive {
text-decoration: underline;
}
.phones {
position: absolute;
color: #333;
}
.phones .phone {
margin-right: 5px;
font-family: Helvetica, sans-serif;
text-transform: uppercase;
font-size: 50%;
}
.phones .phone:last-child {
margin-right: 0;
}
#footer {
margin-top: 100px;
border-top: 1px dotted black;
font-size: 8pt;
font-style: italic;
font-family: Helvetica, sans-serif;
padding: 10px;
}
</style>
</head>
<body>
<div id="header">
<!-- <h1 class="home"><a href="/">Gentle</a></h1> -->
<audio id="audio" src="a.wav" controls="true" preload="auto"></audio>
<img src="/preloader.gif" id="preloader" alt="loading...">
<span id="downloads"> </span>
</div>
<div id="transcript"></div>
<!-- <div id="footer">
<a href="https://lowerquality.com/gentle">Gentle</a> is free software released under the <a href="https://opensource.org/licenses/MIT">MIT license</a>. <a href="https://lowerquality.com/gentle">Homepage</a> | <a href="https://github.com/lowerquality/gentle">Source code</a>.
</div> -->
<script>
function get(url, cb) {
var xhr = new XMLHttpRequest();
xhr.open("GET", url, true);
xhr.onload = function() {
cb(this.responseText);
}
xhr.send();
}
function get_json(url, cb) {
get(url, function(x) {
cb(JSON.parse(x));
});
}
var $a = document.getElementById("audio");
window.onkeydown = function(ev) {
if(ev.keyCode == 32) {
ev.preventDefault();
$a.pause();
}
}
var $trans = document.getElementById("transcript");
var $preloader = document.getElementById('preloader');
var wds = [];
var cur_wd;
var $phones = document.createElement("div");
$phones.className = "phones";
document.body.appendChild($phones);
var cur_phones$ = []; // List of phoneme $divs
var $active_phone;
function render_phones(wd) {
cur_phones$ = [];
$phones.innerHTML = "";
$active_phone = null;
$phones.style.top = wd.$div.offsetTop + 18;
$phones.style.left = wd.$div.offsetLeft;
var dur = wd.end - wd.start;
var start_x = wd.$div.offsetLeft;
wd.phones
.forEach(function(ph){
var $p = document.createElement("span");
$p.className = "phone";
$p.textContent = ph.phone.split("_")[0];
$phones.appendChild($p);
cur_phones$.push($p);
});
var offsetToCenter = (wd.$div.offsetWidth - $phones.offsetWidth) / 2;
$phones.style.left = wd.$div.offsetLeft + offsetToCenter;
}
function highlight_phone(t) {
    // Underline the phoneme of the currently-active word at playback time `t`.
    if(!cur_wd) {
        $phones.innerHTML = "";
        return;
    }
    var hit;
    var cur_t = cur_wd.start;
    // Walk the word's phonemes accumulating durations to find which one
    // contains time `t`.
    cur_wd.phones.forEach(function(ph, idx) {
        if(cur_t <= t && cur_t + ph.duration >= t) {
            hit = idx;
        }
        cur_t += ph.duration;
    });
    // BUG FIX: `if(hit)` treated index 0 as "no hit", so the first phoneme
    // of every word was never highlighted; compare against undefined.
    if(hit !== undefined) {
        var $ph = cur_phones$[hit];
        if($ph != $active_phone) {
            if($active_phone) {
                $active_phone.classList.remove("phactive");
            }
            if($ph) {
                $ph.classList.add("phactive");
            }
        }
        $active_phone = $ph;
    }
}
function highlight_word() {
var t = $a.currentTime;
// XXX: O(N); use binary search
var hits = wds.filter(function(x) {
return (t - x.start) > 0.01 && (x.end - t) > 0.01;
}, wds);
var next_wd = hits[hits.length - 1];
if(cur_wd != next_wd) {
var active = document.querySelectorAll('.active');
for(var i = 0; i < active.length; i++) {
active[i].classList.remove('active');
}
if(next_wd && next_wd.$div) {
next_wd.$div.classList.add('active');
render_phones(next_wd);
}
}
cur_wd = next_wd;
highlight_phone(t);
window.requestAnimationFrame(highlight_word);
}
window.requestAnimationFrame(highlight_word);
$trans.innerHTML = "Loading...";
function render(ret) {
wds = ret['words'] || [];
transcript = ret['transcript'];
$trans.innerHTML = '';
var currentOffset = 0;
wds.forEach(function(wd) {
if(wd.case == 'not-found-in-transcript') {
// TODO: show phonemes somewhere
var txt = ' ' + wd.word;
var $plaintext = document.createTextNode(txt);
$trans.appendChild($plaintext);
return;
}
// Add non-linked text
if(wd.startOffset > currentOffset) {
var txt = transcript.slice(currentOffset, wd.startOffset);
var $plaintext = document.createTextNode(txt);
$trans.appendChild($plaintext);
currentOffset = wd.startOffset;
}
var $wd = document.createElement('span');
var txt = transcript.slice(wd.startOffset, wd.endOffset);
var $wdText = document.createTextNode(txt);
$wd.appendChild($wdText);
wd.$div = $wd;
if(wd.start !== undefined) {
$wd.className = 'success';
}
$wd.onclick = function() {
if(wd.start !== undefined) {
console.log(wd.start);
$a.currentTime = wd.start;
$a.play();
}
};
$trans.appendChild($wd);
currentOffset = wd.endOffset;
});
var txt = transcript.slice(currentOffset, transcript.length);
var $plaintext = document.createTextNode(txt);
$trans.appendChild($plaintext);
currentOffset = transcript.length;
}
function show_downloads() {
var $d = document.getElementById("downloads");
$d.textContent = "Download as: ";
var uid = window.location.pathname.split("/")[2];
// Name, path, title, inhibit-on-file:///
[["CSV", "align.csv", "Word alignment CSV"],
["JSON", "align.json", "JSON word/phoneme alignment data"],
["Zip", "/zip/" + uid + ".zip", "Standalone zipfile", true]]
.forEach(function(x) {
var $a = document.createElement("a");
$a.className = "download";
$a.textContent = x[0];
$a.href = x[1];
$a.title = x[2];
if(!x[3] || window.location.protocol != "file:") {
$d.appendChild($a);
}
});
}
var status_init = false;
var status_log = []; // [ status ]
var $status_pro;
function render_status(ret) {
if(!status_init) {
// Clobber the $trans div and use it for status updates
$trans.innerHTML = "<h2>transcription in progress</h2>";
$trans.className = "status";
$status_pro = document.createElement("progress");
$status_pro.setAttribute("min", "0");
$status_pro.setAttribute("max", "100");
$status_pro.value = 0;
$trans.appendChild($status_pro);
status_init = true;
}
if(ret.status !== "TRANSCRIBING") {
if(ret.percent) {
$status_pro.value = (100*ret.percent);
}
}
else if(ret.percent && (status_log.length == 0 || status_log[status_log.length-1].percent+0.0001 < ret.percent)) {
// New entry
var $entry = document.createElement("div");
$entry.className = "entry";
$entry.textContent = ret.message;
ret.$div = $entry;
if(ret.percent) {
$status_pro.value = (100*ret.percent);
}
if(status_log.length > 0) {
$trans.insertBefore($entry, status_log[status_log.length-1].$div);
}
else {
$trans.appendChild($entry);
}
status_log.push(ret);
}
}
function update() {
if(INLINE_JSON) {
// We want this to work from file:/// domains, so we provide a
// mechanism for inlining the alignment data.
render(INLINE_JSON);
// show_downloads();
}
else {
// Show the status
get_json('status.json', function(ret) {
$a.style.visibility = 'hidden';
if (ret.status == 'ERROR') {
$preloader.style.visibility = 'hidden';
$trans.innerHTML = '<b>' + ret.status + ': ' + ret.error + '</b>';
} else if (ret.status == 'TRANSCRIBING' || ret.status == 'ALIGNING') {
$preloader.style.visibility = 'visible';
render_status(ret);
setTimeout(update, 2000);
} else if (ret.status == 'OK') {
// show_downloads();
$preloader.style.visibility = 'hidden';
// XXX: should we fetch the align.json?
// window.location.reload();
$a.style.visibility = 'visible';
render(ret);
} else if (ret.status == 'ENCODING' || ret.status == 'STARTED') {
$preloader.style.visibility = 'visible';
$trans.innerHTML = 'Encoding, please wait...';
setTimeout(update, 2000);
} else {
console.log("unknown status", ret);
$preloader.style.visibility = 'hidden';
$trans.innerHTML = ret.status + '...';
setTimeout(update, 5000);
}
});
}
}
var INLINE_JSON;
update();
</script></body></html>

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.7 KiB

View File

@ -8,12 +8,11 @@ import typer
# import rpyc
# from tqdm import tqdm
# from pydub import AudioSegment
# from pydub.silence import split_on_silence
from plume.utils import lazy_module, lazy_callable
rpyc = lazy_module('rpyc')
AudioSegment = lazy_callable('pydub.AudioSegment')
pydub = lazy_module('pydub')
split_on_silence = lazy_callable('pydub.silence.split_on_silence')
app = typer.Typer()
@ -106,7 +105,7 @@ def triton_transcribe_grpc_gen(
# ]
# pass
transcript_list = []
sil_pad = AudioSegment.silent(duration=sil_msec)
sil_pad = pydub.AudioSegment.silent(duration=sil_msec)
for seg in chunks:
t_seg = sil_pad + seg + sil_pad
c_transcript = transcriber(t_seg)
@ -124,9 +123,7 @@ def triton_transcribe_grpc_gen(
@app.command()
def file(audio_file: Path, write_file: bool = False, chunked=True):
from pydub import AudioSegment
aseg = AudioSegment.from_file(audio_file)
aseg = pydub.AudioSegment.from_file(audio_file)
transcriber, prep = triton_transcribe_grpc_gen()
transcription = transcriber(prep(aseg))
@ -139,10 +136,8 @@ def file(audio_file: Path, write_file: bool = False, chunked=True):
@app.command()
def benchmark(audio_file: Path):
from pydub import AudioSegment
transcriber, audio_prep = transcribe_rpyc_gen()
file_seg = AudioSegment.from_file(audio_file)
file_seg = pydub.AudioSegment.from_file(audio_file)
aud_seg = audio_prep(file_seg)
def timeinfo():

View File

@ -27,6 +27,10 @@ class GoogleTTS(object):
audio_encoding=texttospeech.enums.AudioEncoding.LINEAR16,
sample_rate_hertz=params["sample_rate"],
)
if 'speaking_rate' in params:
audio_config.speaking_rate = params['speaking_rate']
if 'pitch' in params:
audio_config.pitch = params['pitch']
response = self.client.synthesize_speech(tts_input, voice, audio_config)
audio_content = response.audio_content
return audio_content
@ -74,6 +78,19 @@ class GoogleTTS(object):
)
return results
@classmethod
def voice_by_name(cls, name):
    """Return the voice description dict whose ``name`` field matches.

    :param name: exact voice name, e.g. ``"en-US-Wavenet-D"``.
    :returns: the matching voice dict from :meth:`voice_list`.
    :raises ValueError: if no available voice has that name.
    """
    # BUG FIX (docs): previous docstring was copy-pasted from voice_list
    # and claimed this "Lists the available voices".
    results = cls.voice_list()
    for voice in results:
        if voice['name'] == name:
            return voice
    raise ValueError(f'{name} not a valid voice')
@app.command()
def generate_audio_file(text, dest_path: Path = "./tts_audio.wav", voice="en-US-Wavenet-D"):

85
plume/utils/ui_persist.py Normal file
View File

@ -0,0 +1,85 @@
from plume.utils import ExtendedPath, get_mongo_conn
from plume.utils.st_rerun import rerun
from uuid import uuid4
from pathlib import Path
def setup_file_state(st):
    """Attach file-backed cursor persistence helpers to the streamlit module.

    Idempotent: guarded by ``st.state_lock`` so a streamlit rerun does not
    re-install the helpers or reset the cursor.
    """
    if hasattr(st, "state_lock"):
        return
    # Cursor position is persisted as JSON in a local lock file.
    lock_path = ExtendedPath("preview.lck")

    def read_cursor():
        return lock_path.read_json()["current_cursor"]

    def write_cursor(val=0):
        lock_path.write_json({"current_cursor": val})
        rerun()

    st.get_current_cursor = read_cursor
    st.update_cursor = write_cursor
    st.state_lock = True
    # Initialize the cursor on first setup (triggers a streamlit rerun).
    write_cursor(0)
def setup_mongo_asr_validation_state(st):
    """Attach mongo-backed state helpers for the ASR validation UI to ``st``.

    Installs cursor get/update, correction-entry get/update and task
    selection helpers as attributes of the streamlit module object.
    Idempotent via the ``st.mongo_connected`` flag so streamlit reruns
    do not reconnect or reset state.
    """
    if not hasattr(st, "mongo_connected"):
        st.mongoclient = get_mongo_conn(col="asr_validation")
        mongo_conn = st.mongoclient
        # Fresh random task id; may be replaced later via st.set_task().
        st.task_id = str(uuid4())

        def current_cursor_fn():
            # mongo_conn = st.mongoclient
            cursor_obj = mongo_conn.find_one(
                {"type": "current_cursor", "task_id": st.task_id}
            )
            cursor_val = cursor_obj["cursor"]
            return cursor_val

        def update_cursor_fn(val=0):
            # Upsert so the first update creates the cursor document.
            mongo_conn.find_one_and_update(
                {"type": "current_cursor", "task_id": st.task_id},
                {
                    "$set": {
                        "type": "current_cursor",
                        "task_id": st.task_id,
                        "cursor": val,
                    }
                },
                upsert=True,
            )
            # Force streamlit to re-render with the new cursor value.
            rerun()

        def get_correction_entry_fn(code):
            # Drop _id so the result is JSON-serializable for the UI.
            return mongo_conn.find_one(
                {"type": "correction", "code": code}, projection={"_id": False}
            )

        def update_entry_fn(code, value):
            mongo_conn.find_one_and_update(
                {"type": "correction", "code": code},
                {"$set": {"value": value, "task_id": st.task_id}},
                upsert=True,
            )

        def set_task_fn(data_path, task_id):
            # Switch to an explicit task id and drop a lock file marking it.
            if task_id:
                st.task_id = task_id
            task_path = data_path / Path(f"task-{st.task_id}.lck")
            if not task_path.exists():
                print(f"creating task lock at {task_path}")
                task_path.touch()

        st.get_current_cursor = current_cursor_fn
        st.update_cursor = update_cursor_fn
        st.get_correction_entry = get_correction_entry_fn
        st.update_entry = update_entry_fn
        st.set_task = set_task_fn
        st.mongo_connected = True
        # Seed the cursor document if this task has never stored one.
        cursor_obj = mongo_conn.find_one(
            {"type": "current_cursor", "task_id": st.task_id}
        )
        if not cursor_obj:
            update_cursor_fn(0)

205
plume/utils/vad.py Normal file
View File

@ -0,0 +1,205 @@
import logging
import asyncio
import argparse
from pathlib import Path
import webrtcvad
import pydub
from pydub.playback import play
from pydub.utils import make_chunks
DEFAULT_CHUNK_DUR = 20
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)
def is_frame_voice(vad, seg, chunk_dur):
    """Return True if ``seg`` is a full ``chunk_dur``-millisecond frame
    that ``vad`` classifies as speech.

    Frames shorter than ``chunk_dur`` (e.g. the trailing remainder of a
    chunked segment) are never considered voice — the VAD only accepts
    fixed-size frames.

    :param vad: VAD object exposing ``is_speech(raw_data, sample_rate)``.
    :param seg: audio segment exposing ``duration_seconds``, ``raw_data``
        and ``frame_rate`` (e.g. a pydub.AudioSegment chunk).
    :param chunk_dur: expected frame duration in milliseconds.
    """
    # `and` short-circuits, so is_speech is only consulted for full frames;
    # bool() keeps the original strict True/False return type
    # (replaces the redundant `True if (...) else False`).
    return bool(
        seg.duration_seconds == chunk_dur / 1000
        and vad.is_speech(seg.raw_data, seg.frame_rate)
    )
class VADFilterAudio(object):
    """Strip non-speech audio from a segment using WebRTC VAD."""

    def __init__(self, chunk_dur=DEFAULT_CHUNK_DUR):
        super(VADFilterAudio, self).__init__()
        self.chunk_dur = chunk_dur  # frame size in milliseconds
        self.vad = webrtcvad.Vad()

    def filter_segment(self, wav_seg):
        """Return a new AudioSegment containing only the voiced frames
        of ``wav_seg``.

        The input is cut into ``chunk_dur``-ms frames; the final
        (usually partial) frame is dropped because the VAD requires
        fixed-size input.
        """
        chunks = make_chunks(wav_seg, self.chunk_dur)
        # b"".join avoids the quadratic cost of repeated `bytes +=` in a
        # loop; also drops the unused enumerate() index of the original.
        speech_buffer = b"".join(
            c.raw_data
            for c in chunks[:-1]
            if is_frame_voice(self.vad, c, self.chunk_dur)
        )
        return pydub.AudioSegment(
            data=speech_buffer,
            frame_rate=wav_seg.frame_rate,
            channels=wav_seg.channels,
            sample_width=wav_seg.sample_width,
        )
class VADUtterance(object):
    """Segment an async stream of audio chunks into utterances via WebRTC VAD.

    All durations are in milliseconds.  An utterance is emitted either
    when accumulated voice exceeds ``max_utterance`` (overflow), or when
    at least ``max_silence`` of silence follows at least
    ``min_utterance`` of voice.
    """

    def __init__(
        self,
        max_silence=500,
        min_utterance=280,
        max_utterance=20000,
        chunk_dur=DEFAULT_CHUNK_DUR,
        start_cycles=3,
    ):
        """Configure segmentation thresholds.

        Parameters
        ----------
        max_silence : int
            Silence (ms) that terminates an utterance.
        min_utterance : int
            Minimum voice (ms) required for an utterance to be yielded.
        max_utterance : int
            Voice (ms) after which the buffer is force-flushed.
        chunk_dur : int
            Incoming chunk length (ms); must match the stream's chunks.
        start_cycles : int
            Number of consecutive voiced chunks before the
            "started speaking" event fires in :meth:`stream_events`.
        """
        super(VADUtterance, self).__init__()
        self.vad = webrtcvad.Vad()
        self.chunk_dur = chunk_dur
        # duration in millisecs
        self.max_sil = max_silence
        self.min_utt = min_utterance
        self.max_utt = max_utterance
        # voice must persist for `start_cycles` chunks to count as speech onset
        self.speech_start = start_cycles * chunk_dur

    def __repr__(self):
        return f"VAD(max_silence={self.max_sil},min_utterance:{self.min_utt},max_utterance:{self.max_utt})"

    async def stream_utterance(self, audio_stream):
        """Async-yield utterance AudioSegments detected in *audio_stream*.

        *audio_stream* must be an async iterator of pydub AudioSegments
        of ``chunk_dur`` milliseconds each.  Any residual voice left in
        the buffer when the stream ends is yielded as a final utterance.
        """
        silence_buffer = pydub.AudioSegment.empty()
        voice_buffer = pydub.AudioSegment.empty()
        silence_threshold = False
        async for c in audio_stream:
            voice_frame = is_frame_voice(self.vad, c, self.chunk_dur)
            logger.debug(f"is audio stream voice? {voice_frame}")
            if voice_frame:
                # any voice resets the silence state
                silence_threshold = False
                voice_buffer += c
                silence_buffer = pydub.AudioSegment.empty()
            else:
                silence_buffer += c
            # durations tracked in milliseconds to match the thresholds
            voc_dur = voice_buffer.duration_seconds * 1000
            sil_dur = silence_buffer.duration_seconds * 1000
            if voc_dur >= self.max_utt:
                logger.info(
                    f"detected voice overflow: voice duration {voice_buffer.duration_seconds}"
                )
                yield voice_buffer
                voice_buffer = pydub.AudioSegment.empty()
            if sil_dur >= self.max_sil:
                if voc_dur >= self.min_utt:
                    logger.info(
                        f"detected silence: voice duration {voice_buffer.duration_seconds}"
                    )
                    yield voice_buffer
                    voice_buffer = pydub.AudioSegment.empty()
                # ignore/clear voice if silence reached threshold or indent the statement
                # NOTE(review): voice shorter than min_utt is NOT cleared here,
                # so a brief blip may be merged into the next utterance —
                # the original comment suggests this was unresolved; confirm.
                if not silence_threshold:
                    silence_threshold = True
        # flush whatever voice remains when the stream is exhausted
        if voice_buffer:
            yield voice_buffer

    async def stream_events(self, audio_stream):
        """Async-yield (event_code, payload) tuples from *audio_stream*.

        yields 0, voice_buffer for SpeechBuffer
        yields 1, None for StartedSpeaking
        yields 2, None for StoppedSpeaking
        yields 4, audio_stream
        (event 4 is currently disabled — see the commented-out yield below)
        """
        silence_buffer = pydub.AudioSegment.empty()
        voice_buffer = pydub.AudioSegment.empty()
        silence_threshold, started_speaking = False, False
        async for c in audio_stream:
            # yield (4, c)
            voice_frame = is_frame_voice(self.vad, c, self.chunk_dur)
            logger.debug(f"is audio stream voice? {voice_frame}")
            if voice_frame:
                silence_threshold = False
                voice_buffer += c
                silence_buffer = pydub.AudioSegment.empty()
            else:
                silence_buffer += c
            voc_dur = voice_buffer.duration_seconds * 1000
            sil_dur = silence_buffer.duration_seconds * 1000
            # speech onset: enough consecutive voice and not already announced
            if voc_dur >= self.speech_start and not started_speaking:
                started_speaking = True
                yield (1, None)
            if voc_dur >= self.max_utt:
                logger.info(
                    f"detected voice overflow: voice duration {voice_buffer.duration_seconds}"
                )
                yield (0, voice_buffer)
                voice_buffer = pydub.AudioSegment.empty()
                started_speaking = False
            if sil_dur >= self.max_sil:
                if voc_dur >= self.min_utt:
                    logger.info(
                        f"detected silence: voice duration {voice_buffer.duration_seconds}"
                    )
                    yield (0, voice_buffer)
                    voice_buffer = pydub.AudioSegment.empty()
                    started_speaking = False
                # ignore/clear voice if silence reached threshold or indent the statement
                # NOTE(review): same unresolved short-voice handling as in
                # stream_utterance — sub-min_utt voice is kept, not cleared.
                if not silence_threshold:
                    silence_threshold = True
                    yield (2, None)
        # flush residual voice at end-of-stream
        if voice_buffer:
            yield (0, voice_buffer)

    @classmethod
    async def stream_utterance_file(cls, audio_file):
        """Demo helper: segment *audio_file* and play each utterance aloud.

        The file is resampled to 32 kHz (a rate webrtcvad accepts) and
        chopped into DEFAULT_CHUNK_DUR-ms chunks before being fed through
        a default-configured instance of this class.
        """
        async def stream_gen():
            audio_seg = pydub.AudioSegment.from_file(audio_file).set_frame_rate(32000)
            chunks = make_chunks(audio_seg, DEFAULT_CHUNK_DUR)
            for c in chunks:
                yield c

        va_ut = cls()
        buffer_src = va_ut.stream_utterance(stream_gen())
        async for buf in buffer_src:
            play(buf)
            # short pause between utterances during playback
            await asyncio.sleep(1)
class VADStreamGen(object):
    """Placeholder for a VAD-driven stream generator.

    Currently only stores its constructor argument; no behavior is
    implemented yet.
    """

    def __init__(self, arg):
        """Keep *arg* around on the instance for later use."""
        super().__init__()
        self.arg = arg
def main():
    """CLI entry point: segment an audio file into utterances and play them.

    Parses ``--audio_file`` (opened in binary mode, defaulting to
    ``./test_utter2.wav``) and runs the async
    ``VADUtterance.stream_utterance_file`` demo to completion.
    """
    prog = Path(__file__).stem
    parser = argparse.ArgumentParser(prog=prog, description="transcribes audio file")
    parser.add_argument(
        "--audio_file",
        type=argparse.FileType("rb"),
        help="audio file to transcribe",
        default="./test_utter2.wav",
    )
    args = parser.parse_args()
    # asyncio.run (3.7+) replaces the get_event_loop()/run_until_complete
    # pattern: the old code never closed the loop it created, and
    # get_event_loop() for this purpose is deprecated.
    asyncio.run(VADUtterance.stream_utterance_file(args.audio_file))
if __name__ == "__main__":
main()

View File

@ -58,6 +58,9 @@ extra_requirements = {
"stringcase~=1.2.0",
"google-cloud-speech~=1.3.1",
],
"ui": [
"rangehttpserver~=1.2.0",
],
"train": ["torchaudio~=0.6.0", "torch-stft~=0.1.4"],
}