1. Self-contained typers

2. ASR force-aligner visualization
3. streamlit state management abstraction
4. new utils / reorganize
5. added verbose flags
6. add tts by name
tegra
Malar 2021-03-23 13:27:35 +05:30
parent f72c6bbe5b
commit c474aa5f5a
22 changed files with 1097 additions and 146 deletions

1
MANIFEST.in Normal file
View File

@ -0,0 +1 @@
graft plume/utils/gentle_preview

View File

@ -7,12 +7,12 @@ from .eval import app as eval_app
from .serve import app as serve_app
app = typer.Typer()
app.add_typer(data_app, name="data")
app.add_typer(ui_app, name="ui")
app.add_typer(train_app, name="train")
app.add_typer(eval_app, name="eval")
app.add_typer(serve_app, name="serve")
app.add_typer(utils_app, name='utils')
app.add_typer(data_app)
app.add_typer(ui_app)
app.add_typer(train_app)
app.add_typer(eval_app)
app.add_typer(serve_app)
app.add_typer(utils_app)
def main():

View File

@ -27,6 +27,13 @@ app.add_typer(generate_app, name="generate")
app.add_typer(wav2vec2_app, name="wav2vec2")
@app.callback()
def data():
"""
data sub commands
"""
@app.command()
def fix_path(dataset_path: Path, force: bool = False):
manifest_path = dataset_path / Path("manifest.json")

View File

@ -3,3 +3,10 @@ from ..models.wav2vec2.eval import app as wav2vec2_app
app = typer.Typer()
app.add_typer(wav2vec2_app, name="wav2vec2")
@app.callback()
def eval():
"""
eval sub commands
"""

View File

@ -5,3 +5,10 @@ from ..models.jasper.serve import app as jasper_app
app = typer.Typer()
app.add_typer(wav2vec2_app, name="wav2vec2")
app.add_typer(jasper_app, name="jasper")
@app.callback()
def serve():
"""
serve sub commands
"""

View File

@ -1,5 +1,12 @@
import typer
from ..models.wav2vec2.train import app as train_app
from ..models.wav2vec2.train import app as wav2vec2_app
app = typer.Typer()
app.add_typer(train_app, name="wav2vec2")
app.add_typer(wav2vec2_app, name="wav2vec2")
@app.callback()
def train():
"""
train sub commands
"""

View File

@ -3,12 +3,20 @@ import sys
from pathlib import Path
from plume.utils import lazy_module
# from streamlit import cli as stcli
stcli = lazy_module('streamlit.cli')
stcli = lazy_module("streamlit.cli")
app = typer.Typer()
@app.callback()
def ui():
"""
ui sub commands
"""
@app.command()
def annotation(data_dir: Path, dump_fname: Path = "ui_dump.json", task_id: str = ""):
annotation_lit_path = Path(__file__).parent / Path("annotation.py")
@ -40,13 +48,7 @@ def annotation(data_dir: Path, dump_fname: Path = "ui_dump.json", task_id: str =
@app.command()
def preview(manifest_path: Path):
annotation_lit_path = Path(__file__).parent / Path("preview.py")
sys.argv = [
"streamlit",
"run",
str(annotation_lit_path),
"--",
str(manifest_path)
]
sys.argv = ["streamlit", "run", str(annotation_lit_path), "--", str(manifest_path)]
sys.exit(stcli.main())
@ -56,6 +58,18 @@ def collection(data_dir: Path, task_id: str = ""):
pass
@app.command()
def alignment(preview_dir: Path, port: int = 8010):
    """Serve ``preview_dir`` over HTTP with byte-range support.

    Range requests let the browser seek inside the preview audio
    (presumably for the gentle_preview alignment page — confirm).
    Blocks forever; stop with Ctrl-C.
    """
    # Local imports: RangeHTTPServer is only needed for this command.
    from RangeHTTPServer import RangeRequestHandler
    from functools import partial
    from http.server import HTTPServer

    server_address = ("", port)  # bind all interfaces on `port`
    handler_class = partial(RangeRequestHandler, directory=str(preview_dir))
    httpd = HTTPServer(server_address, handler_class)
    httpd.serve_forever()
def main():
app()

View File

@ -1,66 +1,14 @@
# import sys
from pathlib import Path
from uuid import uuid4
import streamlit as st
import typer
from plume.utils import ExtendedPath, get_mongo_conn
from plume.preview.st_rerun import rerun
from plume.utils import ExtendedPath
from plume.utils.ui_persist import setup_mongo_asr_validation_state
app = typer.Typer()
if not hasattr(st, "mongo_connected"):
st.mongoclient = get_mongo_conn(col="asr_validation")
mongo_conn = st.mongoclient
st.task_id = str(uuid4())
def current_cursor_fn():
# mongo_conn = st.mongoclient
cursor_obj = mongo_conn.find_one(
{"type": "current_cursor", "task_id": st.task_id}
)
cursor_val = cursor_obj["cursor"]
return cursor_val
def update_cursor_fn(val=0):
mongo_conn.find_one_and_update(
{"type": "current_cursor", "task_id": st.task_id},
{"$set": {"type": "current_cursor", "task_id": st.task_id, "cursor": val}},
upsert=True,
)
rerun()
def get_correction_entry_fn(code):
return mongo_conn.find_one(
{"type": "correction", "code": code}, projection={"_id": False}
)
def update_entry_fn(code, value):
mongo_conn.find_one_and_update(
{"type": "correction", "code": code},
{"$set": {"value": value, "task_id": st.task_id}},
upsert=True,
)
def set_task_fn(data_path, task_id):
if task_id:
st.task_id = task_id
task_path = data_path / Path(f"task-{st.task_id}.lck")
if not task_path.exists():
print(f"creating task lock at {task_path}")
task_path.touch()
st.get_current_cursor = current_cursor_fn
st.update_cursor = update_cursor_fn
st.get_correction_entry = get_correction_entry_fn
st.update_entry = update_entry_fn
st.set_task = set_task_fn
st.mongo_connected = True
cursor_obj = mongo_conn.find_one({"type": "current_cursor", "task_id": st.task_id})
if not cursor_obj:
update_cursor_fn(0)
setup_mongo_asr_validation_state(st)
@st.cache()

View File

@ -3,27 +3,11 @@ from pathlib import Path
import streamlit as st
import typer
from plume.utils import ExtendedPath
from plume.preview.st_rerun import rerun
from plume.utils.ui_persist import setup_file_state
app = typer.Typer()
if not hasattr(st, "state_lock"):
# st.task_id = str(uuid4())
task_path = ExtendedPath("preview.lck")
def current_cursor_fn():
return task_path.read_json()["current_cursor"]
def update_cursor_fn(val=0):
task_path.write_json({"current_cursor": val})
rerun()
st.get_current_cursor = current_cursor_fn
st.update_cursor = update_cursor_fn
st.state_lock = True
# cursor_obj = mongo_conn.find_one({"type": "current_cursor", "task_id": st.task_id})
# if not cursor_obj:
update_cursor_fn(0)
setup_file_state(st)
@st.cache()
@ -40,7 +24,7 @@ def main(manifest: Path):
print("Invalid samplno resetting to 0")
st.update_cursor(0)
sample = asr_data[sample_no]
st.title(f"ASR Manifest Preview")
st.title("ASR Manifest Preview")
st.markdown(f"{sample_no+1} of {len(asr_data)} : **{sample['text']}**")
new_sample = st.number_input(
"Go To Sample:", value=sample_no + 1, min_value=1, max_value=len(asr_data)

151
plume/utils/.gitignore vendored Normal file
View File

@ -0,0 +1,151 @@
/data/
/model/
/train/
.env*
*.yaml
*.yml
*.json
# Created by https://www.gitignore.io/api/python
# Edit at https://www.gitignore.io/?templates=python
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# Mr Developer
.mr.developer.cfg
.project
.pydevproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# End of https://www.gitignore.io/api/python
# Created by https://www.gitignore.io/api/macos
# Edit at https://www.gitignore.io/?templates=macos
### macOS ###
# General
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
# End of https://www.gitignore.io/api/macos

View File

@ -11,12 +11,14 @@ from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import subprocess
import shutil
from urllib.parse import urlsplit
# from .lazy_loader import LazyLoader
from .lazy_import import lazy_callable, lazy_module
# from ruamel.yaml import YAML
# import boto3
import typer
# import pymongo
# from slugify import slugify
# import pydub
@ -34,16 +36,16 @@ from .tts import app as tts_app
from .transcribe import app as transcribe_app
from .align import app as align_app
boto3 = lazy_module('boto3')
pymongo = lazy_module('pymongo')
pydub = lazy_module('pydub')
audio_display = lazy_module('librosa.display')
plt = lazy_module('matplotlib.pyplot')
librosa = lazy_module('librosa')
YAML = lazy_callable('ruamel.yaml.YAML')
num2words = lazy_callable('num2words.num2words')
slugify = lazy_callable('slugify.slugify')
compress = lazy_callable('natural.date.compress')
boto3 = lazy_module("boto3")
pymongo = lazy_module("pymongo")
pydub = lazy_module("pydub")
audio_display = lazy_module("librosa.display")
plt = lazy_module("matplotlib.pyplot")
librosa = lazy_module("librosa")
YAML = lazy_callable("ruamel.yaml.YAML")
num2words = lazy_callable("num2words.num2words")
slugify = lazy_callable("slugify.slugify")
compress = lazy_callable("natural.date.compress")
app = typer.Typer()
app.add_typer(tts_app, name="tts")
@ -51,6 +53,13 @@ app.add_typer(align_app, name="align")
app.add_typer(transcribe_app, name="transcribe")
@app.callback()
def utils():
"""
utils sub commands
"""
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
@ -125,6 +134,10 @@ def upload_s3(dataset_path, s3_path):
run_shell(f"aws s3 sync {dataset_path} {s3_path}")
def copy_s3(dataset_path, s3_path):
run_shell(f"aws s3 cp {dataset_path} {s3_path}")
def get_download_path(s3_uri, output_path):
s3_uri_p = urlsplit(s3_uri)
download_path = output_path / Path(s3_uri_p.path[1:])
@ -135,11 +148,12 @@ def get_download_path(s3_uri, output_path):
def s3_downloader():
s3 = boto3.client("s3")
def download_s3(s3_uri, download_path):
def download_s3(s3_uri, download_path, verbose=False):
s3_uri_p = urlsplit(s3_uri)
download_path.parent.mkdir(exist_ok=True, parents=True)
if not download_path.exists():
print(f"downloading {s3_uri} to {download_path}")
if verbose:
print(f"downloading {s3_uri} to {download_path}")
s3.download_file(s3_uri_p.netloc, s3_uri_p.path[1:], str(download_path))
return download_s3
@ -186,6 +200,7 @@ def ui_data_generator(dataset_dir, asr_data_source, verbose=False):
plot_seg(wav_plot_path.absolute(), audio_file)
return {
"audio_path": str(rel_data_path),
"audio_filepath": str(rel_data_path),
"duration": round(audio_dur, 1),
"text": transcript,
"real_idx": num_datapoints,
@ -229,17 +244,17 @@ def ui_dump_manifest_writer(dataset_dir, asr_data_source, verbose=False):
)
asr_manifest = dataset_dir / Path("manifest.json")
with asr_manifest.open("w") as mf:
print(f"writing manifest to {asr_manifest}")
for d in dump_data:
rel_data_path = d["audio_path"]
audio_dur = d["duration"]
transcript = d["text"]
manifest = manifest_str(str(rel_data_path), audio_dur, transcript)
mf.write(manifest)
asr_manifest_writer(asr_manifest, dump_data, verbose=verbose)
# with asr_manifest.open("w") as mf:
# print(f"writing manifest to {asr_manifest}")
# for d in dump_data:
# rel_data_path = d["audio_path"]
# audio_dur = d["duration"]
# transcript = d["text"]
# manifest = manifest_str(str(rel_data_path), audio_dur, transcript)
# mf.write(manifest)
ui_dump_file = dataset_dir / Path("ui_dump.json")
ExtendedPath(ui_dump_file).write_json({"data": dump_data})
ExtendedPath(ui_dump_file).write_json({"data": dump_data}, verbose=verbose)
return num_datapoints
@ -254,9 +269,10 @@ def asr_manifest_reader(data_manifest_path: Path):
yield p
def asr_manifest_writer(asr_manifest_path: Path, manifest_str_source):
def asr_manifest_writer(asr_manifest_path: Path, manifest_str_source, verbose=False):
with asr_manifest_path.open("w") as mf:
print(f"opening {asr_manifest_path} for writing manifest")
if verbose:
print(f"writing asr manifest to {asr_manifest_path}")
for mani_dict in manifest_str_source:
manifest = manifest_str(
mani_dict["audio_filepath"], mani_dict["duration"], mani_dict["text"]
@ -293,37 +309,43 @@ def batch(iterable, n=1):
class ExtendedPath(type(Path())):
"""docstring for ExtendedPath."""
def read_json(self):
print(f"reading json from {self}")
def read_json(self, verbose=False):
if verbose:
print(f"reading json from {self}")
with self.open("r") as jf:
return json.load(jf)
def read_yaml(self):
def read_yaml(self, verbose=False):
yaml = YAML(typ="safe", pure=True)
print(f"reading yaml from {self}")
if verbose:
print(f"reading yaml from {self}")
with self.open("r") as yf:
return yaml.load(yf)
def read_jsonl(self):
print(f"reading jsonl from {self}")
def read_jsonl(self, verbose=False):
if verbose:
print(f"reading jsonl from {self}")
with self.open("r") as jf:
for l in jf.readlines():
yield json.loads(l)
for ln in jf.readlines():
yield json.loads(ln)
def write_json(self, data):
print(f"writing json to {self}")
def write_json(self, data, verbose=False):
if verbose:
print(f"writing json to {self}")
self.parent.mkdir(parents=True, exist_ok=True)
with self.open("w") as jf:
json.dump(data, jf, indent=2)
def write_yaml(self, data):
def write_yaml(self, data, verbose=False):
yaml = YAML()
print(f"writing yaml to {self}")
if verbose:
print(f"writing yaml to {self}")
with self.open("w") as yf:
yaml.dump(data, yf)
def write_jsonl(self, data):
print(f"writing jsonl to {self}")
def write_jsonl(self, data, verbose=False):
if verbose:
print(f"writing jsonl to {self}")
self.parent.mkdir(parents=True, exist_ok=True)
with self.open("w") as jf:
for d in data:

View File

@ -1,12 +1,14 @@
from pathlib import Path
from .tts import GoogleTTS
# from IPython import display
import requests
import io
import typer
import shutil
import typer
from plume.utils import lazy_module
from .tts import GoogleTTS
display = lazy_module('IPython.display')
pydub = lazy_module('pydub')
@ -63,16 +65,19 @@ def gentle_preview(
audio_path: Path,
transcript_path: Path,
service_uri="http://101.53.142.218:8765/transcriptions",
gent_preview_dir="../gentle_preview",
gent_preview_dir="./gentle_preview",
):
from . import ExtendedPath
ab = audio_path.read_bytes()
tt = transcript_path.read_text()
audio, alignment = gentle_aligner(service_uri, ab, tt)
audio.export(gent_preview_dir / Path("a.wav"), format="wav")
alignment["status"] = "OK"
ExtendedPath(gent_preview_dir / Path("status.json")).write_json(alignment)
pkg_gentle_dir = Path(__file__).parent / 'gentle_preview'
shutil.copytree(str(pkg_gentle_dir), str(gent_preview_dir))
# ab = audio_path.read_bytes()
# tt = transcript_path.read_text()
# audio, alignment = gentle_aligner(service_uri, ab, tt)
# audio.export(gent_preview_dir / Path("a.wav"), format="wav")
# alignment["status"] = "OK"
# ExtendedPath(gent_preview_dir / Path("status.json")).write_json(alignment)
def main():

View File

@ -0,0 +1,5 @@
Serve with https://github.com/danvk/RangeHTTPServer
`https://github.com/claysciences/CORSRangeHTTPServer`
`python -m RangeHTTPServer`
`python -m http.server`

View File

@ -0,0 +1,80 @@
<html>
<head>
<meta charset="utf-8" />
<style>
body {font-family: sans-serif; padding-top: 70px; }
textarea { width: 500px; height: 20em; }
input, textarea { margin: 1em 0; }
#header {
position: fixed;
top: 0;
left: 0;
height: 50px;
line-height: 50px;
width: 100%;
background-color: #999;
box-shadow: 0px 0px 5px 0px rgba(0,0,0,0.5);
font-family: Helvetica, sans-serif;
}
#header, #header a {
color: white;
}
.home {
margin: 0;
font-size: 125%;
font-weight: lighter;
text-transform: lowercase;
}
.home a {
margin: 0;
background: #666;
padding-left: 25px;
padding-right: 30px;
margin-right: 20px;
float: left;
text-decoration: none;
}
.home:hover a {
background: #555;
}
#align-button {
background: #CCC;
border: 0;
font-size: 18px;
padding: 10px 30px;
cursor: pointer;
}
#alignment-flags {
background: #CCC;
border: 0;
font-size: 18px;
padding: 10px 30px;
}
#footer {
margin-top: 100px;
border-top: 1px dotted black;
font-size: 8pt;
font-style: italic;
padding: 10px;
}
</style>
</head>
<body>
<div id="header">
<h1 class="home"><a href="/">Gentle</a></h1>
</div>
<form action="/transcriptions" method="POST" enctype="multipart/form-data">
Audio:<br>
<input type=file name=audio><br>
<br>
Transcript:<br>
<textarea name="transcript"></textarea><br>
<input id=alignment-flags name=conservative type=checkbox> Conservative<br>
<input id=alignment-flags name=disfluency type=checkbox> Include disfluencies<br>
<input id="align-button" type=submit value=Align>
</form>
<div id="footer">
<a href="https://lowerquality.com/gentle">Gentle</a> is free software released under the <a href="https://opensource.org/licenses/MIT">MIT license</a>. <a href="https://lowerquality.com/gentle">Homepage</a> | <a href="https://github.com/lowerquality/gentle">Source code</a>.
</div>
</body>
</html>

View File

@ -0,0 +1,408 @@
<html>
<head>
<meta charset="utf-8" />
<style>
html, body {
margin: 0;
padding: 0;
}
#header {
position: fixed;
top: 0;
left: 0;
height: 50px;
line-height: 50px;
width: 100%;
background-color: #999;
box-shadow: 0px 0px 5px 0px rgba(0,0,0,0.5);
font-family: Helvetica, sans-serif;
}
#header, #header a {
color: white;
}
#downloads {
float: right;
background: #999;
}
.download {
float: right;
background: #999;
padding: 0 5px;
}
.home {
margin: 0;
font-size: 125%;
font-weight: lighter;
text-transform: lowercase;
}
.home a {
margin: 0;
background: #666;
padding-left: 25px;
padding-right: 30px;
margin-right: 20px;
float: left;
text-decoration: none;
}
.home:hover a {
background: #555;
}
#audio {
margin-top: 9px;
width: 50%;
display: inline-block;
}
#transcript {
margin: 0 15px;
margin-top: 70px;
margin-bottom: 5em;
white-space: pre-wrap;
line-height: 2em;
max-width: 600px;
color: #999;
}
#transcript.status {
background-color: #333;
color: #fff;
font-family: Courier, mono;
line-height: 1em;
font-size: 10pt;
max-width: 100%;
}
#transcript.status h2 {
padding: 10px;
}
#transcript.status .entry {
margin-bottom: 10px;
padding: 10px;
}
#transcript.status progress {
width: 100%;
height: 30px;
margin-bottom: 20px;
}
.success {
color: black;
}
.success:hover {
text-decoration: underline;
}
.active {
color: magenta;
}
#preloader {
visibility: hidden;
}
.phactive {
text-decoration: underline;
}
.phones {
position: absolute;
color: #333;
}
.phones .phone {
margin-right: 5px;
font-family: Helvetica, sans-serif;
text-transform: uppercase;
font-size: 50%;
}
.phones .phone:last-child {
margin-right: 0;
}
#footer {
margin-top: 100px;
border-top: 1px dotted black;
font-size: 8pt;
font-style: italic;
font-family: Helvetica, sans-serif;
padding: 10px;
}
</style>
</head>
<body>
<div id="header">
<!-- <h1 class="home"><a href="/">Gentle</a></h1> -->
<audio id="audio" src="a.wav" controls="true" preload="auto"></audio>
<img src="/preloader.gif" id="preloader" alt="loading...">
<span id="downloads"> </span>
</div>
<div id="transcript"></div>
<!-- <div id="footer">
<a href="https://lowerquality.com/gentle">Gentle</a> is free software released under the <a href="https://opensource.org/licenses/MIT">MIT license</a>. <a href="https://lowerquality.com/gentle">Homepage</a> | <a href="https://github.com/lowerquality/gentle">Source code</a>.
</div> -->
<script>
function get(url, cb) {
var xhr = new XMLHttpRequest();
xhr.open("GET", url, true);
xhr.onload = function() {
cb(this.responseText);
}
xhr.send();
}
function get_json(url, cb) {
get(url, function(x) {
cb(JSON.parse(x));
});
}
var $a = document.getElementById("audio");
window.onkeydown = function(ev) {
if(ev.keyCode == 32) {
ev.preventDefault();
$a.pause();
}
}
var $trans = document.getElementById("transcript");
var $preloader = document.getElementById('preloader');
var wds = [];
var cur_wd;
var $phones = document.createElement("div");
$phones.className = "phones";
document.body.appendChild($phones);
var cur_phones$ = []; // List of phoneme $divs
var $active_phone;
function render_phones(wd) {
cur_phones$ = [];
$phones.innerHTML = "";
$active_phone = null;
$phones.style.top = wd.$div.offsetTop + 18;
$phones.style.left = wd.$div.offsetLeft;
var dur = wd.end - wd.start;
var start_x = wd.$div.offsetLeft;
wd.phones
.forEach(function(ph){
var $p = document.createElement("span");
$p.className = "phone";
$p.textContent = ph.phone.split("_")[0];
$phones.appendChild($p);
cur_phones$.push($p);
});
var offsetToCenter = (wd.$div.offsetWidth - $phones.offsetWidth) / 2;
$phones.style.left = wd.$div.offsetLeft + offsetToCenter;
}
function highlight_phone(t) {
    // Underline the phoneme of the currently-active word at playback time `t`.
    if(!cur_wd) {
        $phones.innerHTML = "";
        return;
    }
    var hit;
    var cur_t = cur_wd.start;
    // Walk the word's phonemes accumulating durations to find which one
    // contains time `t`.
    cur_wd.phones.forEach(function(ph, idx) {
        if(cur_t <= t && cur_t + ph.duration >= t) {
            hit = idx;
        }
        cur_t += ph.duration;
    });
    // BUG FIX: `if(hit)` treated index 0 as "no hit", so the first phoneme
    // of every word was never highlighted; compare against undefined.
    if(hit !== undefined) {
        var $ph = cur_phones$[hit];
        if($ph != $active_phone) {
            if($active_phone) {
                $active_phone.classList.remove("phactive");
            }
            if($ph) {
                $ph.classList.add("phactive");
            }
        }
        $active_phone = $ph;
    }
}
function highlight_word() {
var t = $a.currentTime;
// XXX: O(N); use binary search
var hits = wds.filter(function(x) {
return (t - x.start) > 0.01 && (x.end - t) > 0.01;
}, wds);
var next_wd = hits[hits.length - 1];
if(cur_wd != next_wd) {
var active = document.querySelectorAll('.active');
for(var i = 0; i < active.length; i++) {
active[i].classList.remove('active');
}
if(next_wd && next_wd.$div) {
next_wd.$div.classList.add('active');
render_phones(next_wd);
}
}
cur_wd = next_wd;
highlight_phone(t);
window.requestAnimationFrame(highlight_word);
}
window.requestAnimationFrame(highlight_word);
$trans.innerHTML = "Loading...";
function render(ret) {
wds = ret['words'] || [];
transcript = ret['transcript'];
$trans.innerHTML = '';
var currentOffset = 0;
wds.forEach(function(wd) {
if(wd.case == 'not-found-in-transcript') {
// TODO: show phonemes somewhere
var txt = ' ' + wd.word;
var $plaintext = document.createTextNode(txt);
$trans.appendChild($plaintext);
return;
}
// Add non-linked text
if(wd.startOffset > currentOffset) {
var txt = transcript.slice(currentOffset, wd.startOffset);
var $plaintext = document.createTextNode(txt);
$trans.appendChild($plaintext);
currentOffset = wd.startOffset;
}
var $wd = document.createElement('span');
var txt = transcript.slice(wd.startOffset, wd.endOffset);
var $wdText = document.createTextNode(txt);
$wd.appendChild($wdText);
wd.$div = $wd;
if(wd.start !== undefined) {
$wd.className = 'success';
}
$wd.onclick = function() {
if(wd.start !== undefined) {
console.log(wd.start);
$a.currentTime = wd.start;
$a.play();
}
};
$trans.appendChild($wd);
currentOffset = wd.endOffset;
});
var txt = transcript.slice(currentOffset, transcript.length);
var $plaintext = document.createTextNode(txt);
$trans.appendChild($plaintext);
currentOffset = transcript.length;
}
function show_downloads() {
var $d = document.getElementById("downloads");
$d.textContent = "Download as: ";
var uid = window.location.pathname.split("/")[2];
// Name, path, title, inhibit-on-file:///
[["CSV", "align.csv", "Word alignment CSV"],
["JSON", "align.json", "JSON word/phoneme alignment data"],
["Zip", "/zip/" + uid + ".zip", "Standalone zipfile", true]]
.forEach(function(x) {
var $a = document.createElement("a");
$a.className = "download";
$a.textContent = x[0];
$a.href = x[1];
$a.title = x[2];
if(!x[3] || window.location.protocol != "file:") {
$d.appendChild($a);
}
});
}
var status_init = false;
var status_log = []; // [ status ]
var $status_pro;
function render_status(ret) {
if(!status_init) {
// Clobber the $trans div and use it for status updates
$trans.innerHTML = "<h2>transcription in progress</h2>";
$trans.className = "status";
$status_pro = document.createElement("progress");
$status_pro.setAttribute("min", "0");
$status_pro.setAttribute("max", "100");
$status_pro.value = 0;
$trans.appendChild($status_pro);
status_init = true;
}
if(ret.status !== "TRANSCRIBING") {
if(ret.percent) {
$status_pro.value = (100*ret.percent);
}
}
else if(ret.percent && (status_log.length == 0 || status_log[status_log.length-1].percent+0.0001 < ret.percent)) {
// New entry
var $entry = document.createElement("div");
$entry.className = "entry";
$entry.textContent = ret.message;
ret.$div = $entry;
if(ret.percent) {
$status_pro.value = (100*ret.percent);
}
if(status_log.length > 0) {
$trans.insertBefore($entry, status_log[status_log.length-1].$div);
}
else {
$trans.appendChild($entry);
}
status_log.push(ret);
}
}
function update() {
if(INLINE_JSON) {
// We want this to work from file:/// domains, so we provide a
// mechanism for inlining the alignment data.
render(INLINE_JSON);
// show_downloads();
}
else {
// Show the status
get_json('status.json', function(ret) {
$a.style.visibility = 'hidden';
if (ret.status == 'ERROR') {
$preloader.style.visibility = 'hidden';
$trans.innerHTML = '<b>' + ret.status + ': ' + ret.error + '</b>';
} else if (ret.status == 'TRANSCRIBING' || ret.status == 'ALIGNING') {
$preloader.style.visibility = 'visible';
render_status(ret);
setTimeout(update, 2000);
} else if (ret.status == 'OK') {
// show_downloads();
$preloader.style.visibility = 'hidden';
// XXX: should we fetch the align.json?
// window.location.reload();
$a.style.visibility = 'visible';
render(ret);
} else if (ret.status == 'ENCODING' || ret.status == 'STARTED') {
$preloader.style.visibility = 'visible';
$trans.innerHTML = 'Encoding, please wait...';
setTimeout(update, 2000);
} else {
console.log("unknown status", ret);
$preloader.style.visibility = 'hidden';
$trans.innerHTML = ret.status + '...';
setTimeout(update, 5000);
}
});
}
}
var INLINE_JSON;
update();
</script></body></html>

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.7 KiB

View File

@ -8,12 +8,11 @@ import typer
# import rpyc
# from tqdm import tqdm
# from pydub import AudioSegment
# from pydub.silence import split_on_silence
from plume.utils import lazy_module, lazy_callable
rpyc = lazy_module('rpyc')
AudioSegment = lazy_callable('pydub.AudioSegment')
pydub = lazy_module('pydub')
split_on_silence = lazy_callable('pydub.silence.split_on_silence')
app = typer.Typer()
@ -106,7 +105,7 @@ def triton_transcribe_grpc_gen(
# ]
# pass
transcript_list = []
sil_pad = AudioSegment.silent(duration=sil_msec)
sil_pad = pydub.AudioSegment.silent(duration=sil_msec)
for seg in chunks:
t_seg = sil_pad + seg + sil_pad
c_transcript = transcriber(t_seg)
@ -124,9 +123,7 @@ def triton_transcribe_grpc_gen(
@app.command()
def file(audio_file: Path, write_file: bool = False, chunked=True):
from pydub import AudioSegment
aseg = AudioSegment.from_file(audio_file)
aseg = pydub.AudioSegment.from_file(audio_file)
transcriber, prep = triton_transcribe_grpc_gen()
transcription = transcriber(prep(aseg))
@ -139,10 +136,8 @@ def file(audio_file: Path, write_file: bool = False, chunked=True):
@app.command()
def benchmark(audio_file: Path):
from pydub import AudioSegment
transcriber, audio_prep = transcribe_rpyc_gen()
file_seg = AudioSegment.from_file(audio_file)
file_seg = pydub.AudioSegment.from_file(audio_file)
aud_seg = audio_prep(file_seg)
def timeinfo():

View File

@ -27,6 +27,10 @@ class GoogleTTS(object):
audio_encoding=texttospeech.enums.AudioEncoding.LINEAR16,
sample_rate_hertz=params["sample_rate"],
)
if 'speaking_rate' in params:
audio_config.speaking_rate = params['speaking_rate']
if 'pitch' in params:
audio_config.pitch = params['pitch']
response = self.client.synthesize_speech(tts_input, voice, audio_config)
audio_content = response.audio_content
return audio_content
@ -74,6 +78,19 @@ class GoogleTTS(object):
)
return results
@classmethod
def voice_by_name(cls, name):
    """Return the voice description dict whose ``name`` field matches.

    :param name: exact voice name, e.g. ``"en-US-Wavenet-D"``.
    :returns: the matching voice dict from :meth:`voice_list`.
    :raises ValueError: if no available voice has that name.
    """
    # BUG FIX (docs): previous docstring was copy-pasted from voice_list
    # and claimed this "Lists the available voices".
    results = cls.voice_list()
    for voice in results:
        if voice['name'] == name:
            return voice
    raise ValueError(f'{name} not a valid voice')
@app.command()
def generate_audio_file(text, dest_path: Path = "./tts_audio.wav", voice="en-US-Wavenet-D"):

85
plume/utils/ui_persist.py Normal file
View File

@ -0,0 +1,85 @@
from plume.utils import ExtendedPath, get_mongo_conn
from plume.utils.st_rerun import rerun
from uuid import uuid4
from pathlib import Path
def setup_file_state(st):
    """Attach file-backed cursor persistence helpers to the streamlit module.

    Idempotent: guarded by ``st.state_lock`` so a streamlit rerun does not
    re-install the helpers or reset the cursor.
    """
    if hasattr(st, "state_lock"):
        return
    # Cursor position is persisted as JSON in a local lock file.
    lock_path = ExtendedPath("preview.lck")

    def read_cursor():
        return lock_path.read_json()["current_cursor"]

    def write_cursor(val=0):
        lock_path.write_json({"current_cursor": val})
        rerun()

    st.get_current_cursor = read_cursor
    st.update_cursor = write_cursor
    st.state_lock = True
    # Initialize the cursor on first setup (triggers a streamlit rerun).
    write_cursor(0)
def setup_mongo_asr_validation_state(st):
    """Attach mongo-backed state helpers for the ASR validation UI to ``st``.

    Installs cursor get/update, correction-entry get/update and task
    selection helpers as attributes of the streamlit module object.
    Idempotent via the ``st.mongo_connected`` flag so streamlit reruns
    do not reconnect or reset state.
    """
    if not hasattr(st, "mongo_connected"):
        st.mongoclient = get_mongo_conn(col="asr_validation")
        mongo_conn = st.mongoclient
        # Fresh random task id; may be replaced later via st.set_task().
        st.task_id = str(uuid4())

        def current_cursor_fn():
            # mongo_conn = st.mongoclient
            cursor_obj = mongo_conn.find_one(
                {"type": "current_cursor", "task_id": st.task_id}
            )
            cursor_val = cursor_obj["cursor"]
            return cursor_val

        def update_cursor_fn(val=0):
            # Upsert so the first update creates the cursor document.
            mongo_conn.find_one_and_update(
                {"type": "current_cursor", "task_id": st.task_id},
                {
                    "$set": {
                        "type": "current_cursor",
                        "task_id": st.task_id,
                        "cursor": val,
                    }
                },
                upsert=True,
            )
            # Force streamlit to re-render with the new cursor value.
            rerun()

        def get_correction_entry_fn(code):
            # Drop _id so the result is JSON-serializable for the UI.
            return mongo_conn.find_one(
                {"type": "correction", "code": code}, projection={"_id": False}
            )

        def update_entry_fn(code, value):
            mongo_conn.find_one_and_update(
                {"type": "correction", "code": code},
                {"$set": {"value": value, "task_id": st.task_id}},
                upsert=True,
            )

        def set_task_fn(data_path, task_id):
            # Switch to an explicit task id and drop a lock file marking it.
            if task_id:
                st.task_id = task_id
            task_path = data_path / Path(f"task-{st.task_id}.lck")
            if not task_path.exists():
                print(f"creating task lock at {task_path}")
                task_path.touch()

        st.get_current_cursor = current_cursor_fn
        st.update_cursor = update_cursor_fn
        st.get_correction_entry = get_correction_entry_fn
        st.update_entry = update_entry_fn
        st.set_task = set_task_fn
        st.mongo_connected = True
        # Seed the cursor document if this task has never stored one.
        cursor_obj = mongo_conn.find_one(
            {"type": "current_cursor", "task_id": st.task_id}
        )
        if not cursor_obj:
            update_cursor_fn(0)

205
plume/utils/vad.py Normal file
View File

@ -0,0 +1,205 @@
import logging
import asyncio
import argparse
from pathlib import Path
import webrtcvad
import pydub
from pydub.playback import play
from pydub.utils import make_chunks
DEFAULT_CHUNK_DUR = 20
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)
def is_frame_voice(vad, seg, chunk_dur):
    """Return True if ``seg`` is a full ``chunk_dur``-millisecond frame
    that ``vad`` classifies as speech.

    Frames shorter than ``chunk_dur`` (e.g. the trailing remainder of a
    chunked segment) are never considered voice — the VAD only accepts
    fixed-size frames.

    :param vad: VAD object exposing ``is_speech(raw_data, sample_rate)``.
    :param seg: audio segment exposing ``duration_seconds``, ``raw_data``
        and ``frame_rate`` (e.g. a pydub.AudioSegment chunk).
    :param chunk_dur: expected frame duration in milliseconds.
    """
    # `and` short-circuits, so is_speech is only consulted for full frames;
    # bool() keeps the original strict True/False return type
    # (replaces the redundant `True if (...) else False`).
    return bool(
        seg.duration_seconds == chunk_dur / 1000
        and vad.is_speech(seg.raw_data, seg.frame_rate)
    )
class VADFilterAudio(object):
    """Strip non-speech audio from a segment using WebRTC VAD."""

    def __init__(self, chunk_dur=DEFAULT_CHUNK_DUR):
        super(VADFilterAudio, self).__init__()
        self.chunk_dur = chunk_dur  # frame size in milliseconds
        self.vad = webrtcvad.Vad()

    def filter_segment(self, wav_seg):
        """Return a new AudioSegment containing only the voiced frames
        of ``wav_seg``.

        The input is cut into ``chunk_dur``-ms frames; the final
        (usually partial) frame is dropped because the VAD requires
        fixed-size input.
        """
        chunks = make_chunks(wav_seg, self.chunk_dur)
        # b"".join avoids the quadratic cost of repeated `bytes +=` in a
        # loop; also drops the unused enumerate() index of the original.
        speech_buffer = b"".join(
            c.raw_data
            for c in chunks[:-1]
            if is_frame_voice(self.vad, c, self.chunk_dur)
        )
        return pydub.AudioSegment(
            data=speech_buffer,
            frame_rate=wav_seg.frame_rate,
            channels=wav_seg.channels,
            sample_width=wav_seg.sample_width,
        )
class VADUtterance(object):
    """Segment an async stream of audio chunks into utterances via WebRTC VAD.

    All durations are in milliseconds.  An utterance is emitted either
    when accumulated voice exceeds ``max_utterance`` (overflow), or when
    at least ``max_silence`` of silence follows at least
    ``min_utterance`` of voice.
    """

    def __init__(
        self,
        max_silence=500,
        min_utterance=280,
        max_utterance=20000,
        chunk_dur=DEFAULT_CHUNK_DUR,
        start_cycles=3,
    ):
        """Configure segmentation thresholds.

        Parameters
        ----------
        max_silence : int
            Silence (ms) that terminates an utterance.
        min_utterance : int
            Minimum voice (ms) required for an utterance to be yielded.
        max_utterance : int
            Voice (ms) after which the buffer is force-flushed.
        chunk_dur : int
            Incoming chunk length (ms); must match the stream's chunks.
        start_cycles : int
            Number of consecutive voiced chunks before the
            "started speaking" event fires in :meth:`stream_events`.
        """
        super(VADUtterance, self).__init__()
        self.vad = webrtcvad.Vad()
        self.chunk_dur = chunk_dur
        # duration in millisecs
        self.max_sil = max_silence
        self.min_utt = min_utterance
        self.max_utt = max_utterance
        # voice must persist for `start_cycles` chunks to count as speech onset
        self.speech_start = start_cycles * chunk_dur

    def __repr__(self):
        return f"VAD(max_silence={self.max_sil},min_utterance:{self.min_utt},max_utterance:{self.max_utt})"

    async def stream_utterance(self, audio_stream):
        """Async-yield utterance AudioSegments detected in *audio_stream*.

        *audio_stream* must be an async iterator of pydub AudioSegments
        of ``chunk_dur`` milliseconds each.  Any residual voice left in
        the buffer when the stream ends is yielded as a final utterance.
        """
        silence_buffer = pydub.AudioSegment.empty()
        voice_buffer = pydub.AudioSegment.empty()
        silence_threshold = False
        async for c in audio_stream:
            voice_frame = is_frame_voice(self.vad, c, self.chunk_dur)
            logger.debug(f"is audio stream voice? {voice_frame}")
            if voice_frame:
                # any voice resets the silence state
                silence_threshold = False
                voice_buffer += c
                silence_buffer = pydub.AudioSegment.empty()
            else:
                silence_buffer += c
            # durations tracked in milliseconds to match the thresholds
            voc_dur = voice_buffer.duration_seconds * 1000
            sil_dur = silence_buffer.duration_seconds * 1000
            if voc_dur >= self.max_utt:
                logger.info(
                    f"detected voice overflow: voice duration {voice_buffer.duration_seconds}"
                )
                yield voice_buffer
                voice_buffer = pydub.AudioSegment.empty()
            if sil_dur >= self.max_sil:
                if voc_dur >= self.min_utt:
                    logger.info(
                        f"detected silence: voice duration {voice_buffer.duration_seconds}"
                    )
                    yield voice_buffer
                    voice_buffer = pydub.AudioSegment.empty()
                # ignore/clear voice if silence reached threshold or indent the statement
                # NOTE(review): voice shorter than min_utt is NOT cleared here,
                # so a brief blip may be merged into the next utterance —
                # the original comment suggests this was unresolved; confirm.
                if not silence_threshold:
                    silence_threshold = True
        # flush whatever voice remains when the stream is exhausted
        if voice_buffer:
            yield voice_buffer

    async def stream_events(self, audio_stream):
        """Async-yield (event_code, payload) tuples from *audio_stream*.

        yields 0, voice_buffer for SpeechBuffer
        yields 1, None for StartedSpeaking
        yields 2, None for StoppedSpeaking
        yields 4, audio_stream
        (event 4 is currently disabled — see the commented-out yield below)
        """
        silence_buffer = pydub.AudioSegment.empty()
        voice_buffer = pydub.AudioSegment.empty()
        silence_threshold, started_speaking = False, False
        async for c in audio_stream:
            # yield (4, c)
            voice_frame = is_frame_voice(self.vad, c, self.chunk_dur)
            logger.debug(f"is audio stream voice? {voice_frame}")
            if voice_frame:
                silence_threshold = False
                voice_buffer += c
                silence_buffer = pydub.AudioSegment.empty()
            else:
                silence_buffer += c
            voc_dur = voice_buffer.duration_seconds * 1000
            sil_dur = silence_buffer.duration_seconds * 1000
            # speech onset: enough consecutive voice and not already announced
            if voc_dur >= self.speech_start and not started_speaking:
                started_speaking = True
                yield (1, None)
            if voc_dur >= self.max_utt:
                logger.info(
                    f"detected voice overflow: voice duration {voice_buffer.duration_seconds}"
                )
                yield (0, voice_buffer)
                voice_buffer = pydub.AudioSegment.empty()
                started_speaking = False
            if sil_dur >= self.max_sil:
                if voc_dur >= self.min_utt:
                    logger.info(
                        f"detected silence: voice duration {voice_buffer.duration_seconds}"
                    )
                    yield (0, voice_buffer)
                    voice_buffer = pydub.AudioSegment.empty()
                    started_speaking = False
                # ignore/clear voice if silence reached threshold or indent the statement
                # NOTE(review): same unresolved short-voice handling as in
                # stream_utterance — sub-min_utt voice is kept, not cleared.
                if not silence_threshold:
                    silence_threshold = True
                    yield (2, None)
        # flush residual voice at end-of-stream
        if voice_buffer:
            yield (0, voice_buffer)

    @classmethod
    async def stream_utterance_file(cls, audio_file):
        """Demo helper: segment *audio_file* and play each utterance aloud.

        The file is resampled to 32 kHz (a rate webrtcvad accepts) and
        chopped into DEFAULT_CHUNK_DUR-ms chunks before being fed through
        a default-configured instance of this class.
        """
        async def stream_gen():
            audio_seg = pydub.AudioSegment.from_file(audio_file).set_frame_rate(32000)
            chunks = make_chunks(audio_seg, DEFAULT_CHUNK_DUR)
            for c in chunks:
                yield c

        va_ut = cls()
        buffer_src = va_ut.stream_utterance(stream_gen())
        async for buf in buffer_src:
            play(buf)
            # short pause between utterances during playback
            await asyncio.sleep(1)
class VADStreamGen(object):
    """Placeholder for a VAD-driven stream generator.

    Currently only stores its constructor argument; no behavior is
    implemented yet.
    """

    def __init__(self, arg):
        """Keep *arg* around on the instance for later use."""
        super().__init__()
        self.arg = arg
def main():
    """CLI entry point: segment an audio file into utterances and play them.

    Parses ``--audio_file`` (opened in binary mode, defaulting to
    ``./test_utter2.wav``) and runs the async
    ``VADUtterance.stream_utterance_file`` demo to completion.
    """
    prog = Path(__file__).stem
    parser = argparse.ArgumentParser(prog=prog, description="transcribes audio file")
    parser.add_argument(
        "--audio_file",
        type=argparse.FileType("rb"),
        help="audio file to transcribe",
        default="./test_utter2.wav",
    )
    args = parser.parse_args()
    # asyncio.run (3.7+) replaces the get_event_loop()/run_until_complete
    # pattern: the old code never closed the loop it created, and
    # get_event_loop() for this purpose is deprecated.
    asyncio.run(VADUtterance.stream_utterance_file(args.audio_file))
if __name__ == "__main__":
main()

View File

@ -58,6 +58,9 @@ extra_requirements = {
"stringcase~=1.2.0",
"google-cloud-speech~=1.3.1",
],
"ui": [
"rangehttpserver~=1.2.0",
],
"train": ["torchaudio~=0.6.0", "torch-stft~=0.1.4"],
}