1. Self contained typers
2. Asr force-aligner visualization 3. streamlit state management abstraction 4. new utils / reorganize 5. added verbose flags 6. add tts by nametegra
parent
f72c6bbe5b
commit
c474aa5f5a
|
|
@ -0,0 +1 @@
|
|||
graft plume/utils/gentle_preview
|
||||
|
|
@ -7,12 +7,12 @@ from .eval import app as eval_app
|
|||
from .serve import app as serve_app
|
||||
|
||||
app = typer.Typer()
|
||||
app.add_typer(data_app, name="data")
|
||||
app.add_typer(ui_app, name="ui")
|
||||
app.add_typer(train_app, name="train")
|
||||
app.add_typer(eval_app, name="eval")
|
||||
app.add_typer(serve_app, name="serve")
|
||||
app.add_typer(utils_app, name='utils')
|
||||
app.add_typer(data_app)
|
||||
app.add_typer(ui_app)
|
||||
app.add_typer(train_app)
|
||||
app.add_typer(eval_app)
|
||||
app.add_typer(serve_app)
|
||||
app.add_typer(utils_app)
|
||||
|
||||
|
||||
def main():
|
||||
|
|
|
|||
|
|
@ -27,6 +27,13 @@ app.add_typer(generate_app, name="generate")
|
|||
app.add_typer(wav2vec2_app, name="wav2vec2")
|
||||
|
||||
|
||||
@app.callback()
|
||||
def data():
|
||||
"""
|
||||
data sub commands
|
||||
"""
|
||||
|
||||
|
||||
@app.command()
|
||||
def fix_path(dataset_path: Path, force: bool = False):
|
||||
manifest_path = dataset_path / Path("manifest.json")
|
||||
|
|
|
|||
|
|
@ -3,3 +3,10 @@ from ..models.wav2vec2.eval import app as wav2vec2_app
|
|||
|
||||
app = typer.Typer()
|
||||
app.add_typer(wav2vec2_app, name="wav2vec2")
|
||||
|
||||
|
||||
@app.callback()
|
||||
def eval():
|
||||
"""
|
||||
eval sub commands
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -5,3 +5,10 @@ from ..models.jasper.serve import app as jasper_app
|
|||
app = typer.Typer()
|
||||
app.add_typer(wav2vec2_app, name="wav2vec2")
|
||||
app.add_typer(jasper_app, name="jasper")
|
||||
|
||||
|
||||
@app.callback()
|
||||
def serve():
|
||||
"""
|
||||
serve sub commands
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -1,5 +1,12 @@
|
|||
import typer
|
||||
from ..models.wav2vec2.train import app as train_app
|
||||
from ..models.wav2vec2.train import app as wav2vec2_app
|
||||
|
||||
app = typer.Typer()
|
||||
app.add_typer(train_app, name="wav2vec2")
|
||||
app.add_typer(wav2vec2_app, name="wav2vec2")
|
||||
|
||||
|
||||
@app.callback()
|
||||
def train():
|
||||
"""
|
||||
train sub commands
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -3,12 +3,20 @@ import sys
|
|||
from pathlib import Path
|
||||
|
||||
from plume.utils import lazy_module
|
||||
|
||||
# from streamlit import cli as stcli
|
||||
|
||||
stcli = lazy_module('streamlit.cli')
|
||||
stcli = lazy_module("streamlit.cli")
|
||||
app = typer.Typer()
|
||||
|
||||
|
||||
@app.callback()
|
||||
def ui():
|
||||
"""
|
||||
ui sub commands
|
||||
"""
|
||||
|
||||
|
||||
@app.command()
|
||||
def annotation(data_dir: Path, dump_fname: Path = "ui_dump.json", task_id: str = ""):
|
||||
annotation_lit_path = Path(__file__).parent / Path("annotation.py")
|
||||
|
|
@ -40,13 +48,7 @@ def annotation(data_dir: Path, dump_fname: Path = "ui_dump.json", task_id: str =
|
|||
@app.command()
|
||||
def preview(manifest_path: Path):
|
||||
annotation_lit_path = Path(__file__).parent / Path("preview.py")
|
||||
sys.argv = [
|
||||
"streamlit",
|
||||
"run",
|
||||
str(annotation_lit_path),
|
||||
"--",
|
||||
str(manifest_path)
|
||||
]
|
||||
sys.argv = ["streamlit", "run", str(annotation_lit_path), "--", str(manifest_path)]
|
||||
sys.exit(stcli.main())
|
||||
|
||||
|
||||
|
|
@ -56,6 +58,18 @@ def collection(data_dir: Path, task_id: str = ""):
|
|||
pass
|
||||
|
||||
|
||||
@app.command()
|
||||
def alignment(preview_dir: Path, port: int = 8010):
|
||||
from RangeHTTPServer import RangeRequestHandler
|
||||
from functools import partial
|
||||
from http.server import HTTPServer
|
||||
|
||||
server_address = ("", port)
|
||||
handler_class = partial(RangeRequestHandler, directory=str(preview_dir))
|
||||
httpd = HTTPServer(server_address, handler_class)
|
||||
httpd.serve_forever()
|
||||
|
||||
|
||||
def main():
|
||||
app()
|
||||
|
||||
|
|
|
|||
|
|
@ -1,66 +1,14 @@
|
|||
# import sys
|
||||
from pathlib import Path
|
||||
from uuid import uuid4
|
||||
|
||||
import streamlit as st
|
||||
import typer
|
||||
|
||||
from plume.utils import ExtendedPath, get_mongo_conn
|
||||
from plume.preview.st_rerun import rerun
|
||||
from plume.utils import ExtendedPath
|
||||
from plume.utils.ui_persist import setup_mongo_asr_validation_state
|
||||
|
||||
app = typer.Typer()
|
||||
|
||||
|
||||
if not hasattr(st, "mongo_connected"):
|
||||
st.mongoclient = get_mongo_conn(col="asr_validation")
|
||||
mongo_conn = st.mongoclient
|
||||
st.task_id = str(uuid4())
|
||||
|
||||
def current_cursor_fn():
|
||||
# mongo_conn = st.mongoclient
|
||||
cursor_obj = mongo_conn.find_one(
|
||||
{"type": "current_cursor", "task_id": st.task_id}
|
||||
)
|
||||
cursor_val = cursor_obj["cursor"]
|
||||
return cursor_val
|
||||
|
||||
def update_cursor_fn(val=0):
|
||||
mongo_conn.find_one_and_update(
|
||||
{"type": "current_cursor", "task_id": st.task_id},
|
||||
{"$set": {"type": "current_cursor", "task_id": st.task_id, "cursor": val}},
|
||||
upsert=True,
|
||||
)
|
||||
rerun()
|
||||
|
||||
def get_correction_entry_fn(code):
|
||||
return mongo_conn.find_one(
|
||||
{"type": "correction", "code": code}, projection={"_id": False}
|
||||
)
|
||||
|
||||
def update_entry_fn(code, value):
|
||||
mongo_conn.find_one_and_update(
|
||||
{"type": "correction", "code": code},
|
||||
{"$set": {"value": value, "task_id": st.task_id}},
|
||||
upsert=True,
|
||||
)
|
||||
|
||||
def set_task_fn(data_path, task_id):
|
||||
if task_id:
|
||||
st.task_id = task_id
|
||||
task_path = data_path / Path(f"task-{st.task_id}.lck")
|
||||
if not task_path.exists():
|
||||
print(f"creating task lock at {task_path}")
|
||||
task_path.touch()
|
||||
|
||||
st.get_current_cursor = current_cursor_fn
|
||||
st.update_cursor = update_cursor_fn
|
||||
st.get_correction_entry = get_correction_entry_fn
|
||||
st.update_entry = update_entry_fn
|
||||
st.set_task = set_task_fn
|
||||
st.mongo_connected = True
|
||||
cursor_obj = mongo_conn.find_one({"type": "current_cursor", "task_id": st.task_id})
|
||||
if not cursor_obj:
|
||||
update_cursor_fn(0)
|
||||
setup_mongo_asr_validation_state(st)
|
||||
|
||||
|
||||
@st.cache()
|
||||
|
|
|
|||
|
|
@ -3,27 +3,11 @@ from pathlib import Path
|
|||
import streamlit as st
|
||||
import typer
|
||||
from plume.utils import ExtendedPath
|
||||
from plume.preview.st_rerun import rerun
|
||||
from plume.utils.ui_persist import setup_file_state
|
||||
|
||||
app = typer.Typer()
|
||||
|
||||
if not hasattr(st, "state_lock"):
|
||||
# st.task_id = str(uuid4())
|
||||
task_path = ExtendedPath("preview.lck")
|
||||
|
||||
def current_cursor_fn():
|
||||
return task_path.read_json()["current_cursor"]
|
||||
|
||||
def update_cursor_fn(val=0):
|
||||
task_path.write_json({"current_cursor": val})
|
||||
rerun()
|
||||
|
||||
st.get_current_cursor = current_cursor_fn
|
||||
st.update_cursor = update_cursor_fn
|
||||
st.state_lock = True
|
||||
# cursor_obj = mongo_conn.find_one({"type": "current_cursor", "task_id": st.task_id})
|
||||
# if not cursor_obj:
|
||||
update_cursor_fn(0)
|
||||
setup_file_state(st)
|
||||
|
||||
|
||||
@st.cache()
|
||||
|
|
@ -40,7 +24,7 @@ def main(manifest: Path):
|
|||
print("Invalid samplno resetting to 0")
|
||||
st.update_cursor(0)
|
||||
sample = asr_data[sample_no]
|
||||
st.title(f"ASR Manifest Preview")
|
||||
st.title("ASR Manifest Preview")
|
||||
st.markdown(f"{sample_no+1} of {len(asr_data)} : **{sample['text']}**")
|
||||
new_sample = st.number_input(
|
||||
"Go To Sample:", value=sample_no + 1, min_value=1, max_value=len(asr_data)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,151 @@
|
|||
/data/
|
||||
/model/
|
||||
/train/
|
||||
.env*
|
||||
*.yaml
|
||||
*.yml
|
||||
*.json
|
||||
|
||||
|
||||
# Created by https://www.gitignore.io/api/python
|
||||
# Edit at https://www.gitignore.io/?templates=python
|
||||
|
||||
### Python ###
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
pip-wheel-metadata/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
target/
|
||||
|
||||
# pyenv
|
||||
.python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# celery beat schedule file
|
||||
celerybeat-schedule
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# Mr Developer
|
||||
.mr.developer.cfg
|
||||
.project
|
||||
.pydevproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# End of https://www.gitignore.io/api/python
|
||||
|
||||
# Created by https://www.gitignore.io/api/macos
|
||||
# Edit at https://www.gitignore.io/?templates=macos
|
||||
|
||||
### macOS ###
|
||||
# General
|
||||
.DS_Store
|
||||
.AppleDouble
|
||||
.LSOverride
|
||||
|
||||
# Icon must end with two \r
|
||||
Icon
|
||||
|
||||
# Thumbnails
|
||||
._*
|
||||
|
||||
# Files that might appear in the root of a volume
|
||||
.DocumentRevisions-V100
|
||||
.fseventsd
|
||||
.Spotlight-V100
|
||||
.TemporaryItems
|
||||
.Trashes
|
||||
.VolumeIcon.icns
|
||||
.com.apple.timemachine.donotpresent
|
||||
|
||||
# Directories potentially created on remote AFP share
|
||||
.AppleDB
|
||||
.AppleDesktop
|
||||
Network Trash Folder
|
||||
Temporary Items
|
||||
.apdisk
|
||||
|
||||
# End of https://www.gitignore.io/api/macos
|
||||
|
|
@ -11,12 +11,14 @@ from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
|
|||
import subprocess
|
||||
import shutil
|
||||
from urllib.parse import urlsplit
|
||||
|
||||
# from .lazy_loader import LazyLoader
|
||||
from .lazy_import import lazy_callable, lazy_module
|
||||
|
||||
# from ruamel.yaml import YAML
|
||||
# import boto3
|
||||
import typer
|
||||
|
||||
# import pymongo
|
||||
# from slugify import slugify
|
||||
# import pydub
|
||||
|
|
@ -34,16 +36,16 @@ from .tts import app as tts_app
|
|||
from .transcribe import app as transcribe_app
|
||||
from .align import app as align_app
|
||||
|
||||
boto3 = lazy_module('boto3')
|
||||
pymongo = lazy_module('pymongo')
|
||||
pydub = lazy_module('pydub')
|
||||
audio_display = lazy_module('librosa.display')
|
||||
plt = lazy_module('matplotlib.pyplot')
|
||||
librosa = lazy_module('librosa')
|
||||
YAML = lazy_callable('ruamel.yaml.YAML')
|
||||
num2words = lazy_callable('num2words.num2words')
|
||||
slugify = lazy_callable('slugify.slugify')
|
||||
compress = lazy_callable('natural.date.compress')
|
||||
boto3 = lazy_module("boto3")
|
||||
pymongo = lazy_module("pymongo")
|
||||
pydub = lazy_module("pydub")
|
||||
audio_display = lazy_module("librosa.display")
|
||||
plt = lazy_module("matplotlib.pyplot")
|
||||
librosa = lazy_module("librosa")
|
||||
YAML = lazy_callable("ruamel.yaml.YAML")
|
||||
num2words = lazy_callable("num2words.num2words")
|
||||
slugify = lazy_callable("slugify.slugify")
|
||||
compress = lazy_callable("natural.date.compress")
|
||||
|
||||
app = typer.Typer()
|
||||
app.add_typer(tts_app, name="tts")
|
||||
|
|
@ -51,6 +53,13 @@ app.add_typer(align_app, name="align")
|
|||
app.add_typer(transcribe_app, name="transcribe")
|
||||
|
||||
|
||||
@app.callback()
|
||||
def utils():
|
||||
"""
|
||||
utils sub commands
|
||||
"""
|
||||
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
)
|
||||
|
|
@ -125,6 +134,10 @@ def upload_s3(dataset_path, s3_path):
|
|||
run_shell(f"aws s3 sync {dataset_path} {s3_path}")
|
||||
|
||||
|
||||
def copy_s3(dataset_path, s3_path):
|
||||
run_shell(f"aws s3 cp {dataset_path} {s3_path}")
|
||||
|
||||
|
||||
def get_download_path(s3_uri, output_path):
|
||||
s3_uri_p = urlsplit(s3_uri)
|
||||
download_path = output_path / Path(s3_uri_p.path[1:])
|
||||
|
|
@ -135,11 +148,12 @@ def get_download_path(s3_uri, output_path):
|
|||
def s3_downloader():
|
||||
s3 = boto3.client("s3")
|
||||
|
||||
def download_s3(s3_uri, download_path):
|
||||
def download_s3(s3_uri, download_path, verbose=False):
|
||||
s3_uri_p = urlsplit(s3_uri)
|
||||
download_path.parent.mkdir(exist_ok=True, parents=True)
|
||||
if not download_path.exists():
|
||||
print(f"downloading {s3_uri} to {download_path}")
|
||||
if verbose:
|
||||
print(f"downloading {s3_uri} to {download_path}")
|
||||
s3.download_file(s3_uri_p.netloc, s3_uri_p.path[1:], str(download_path))
|
||||
|
||||
return download_s3
|
||||
|
|
@ -186,6 +200,7 @@ def ui_data_generator(dataset_dir, asr_data_source, verbose=False):
|
|||
plot_seg(wav_plot_path.absolute(), audio_file)
|
||||
return {
|
||||
"audio_path": str(rel_data_path),
|
||||
"audio_filepath": str(rel_data_path),
|
||||
"duration": round(audio_dur, 1),
|
||||
"text": transcript,
|
||||
"real_idx": num_datapoints,
|
||||
|
|
@ -229,17 +244,17 @@ def ui_dump_manifest_writer(dataset_dir, asr_data_source, verbose=False):
|
|||
)
|
||||
|
||||
asr_manifest = dataset_dir / Path("manifest.json")
|
||||
with asr_manifest.open("w") as mf:
|
||||
print(f"writing manifest to {asr_manifest}")
|
||||
for d in dump_data:
|
||||
rel_data_path = d["audio_path"]
|
||||
audio_dur = d["duration"]
|
||||
transcript = d["text"]
|
||||
manifest = manifest_str(str(rel_data_path), audio_dur, transcript)
|
||||
mf.write(manifest)
|
||||
|
||||
asr_manifest_writer(asr_manifest, dump_data, verbose=verbose)
|
||||
# with asr_manifest.open("w") as mf:
|
||||
# print(f"writing manifest to {asr_manifest}")
|
||||
# for d in dump_data:
|
||||
# rel_data_path = d["audio_path"]
|
||||
# audio_dur = d["duration"]
|
||||
# transcript = d["text"]
|
||||
# manifest = manifest_str(str(rel_data_path), audio_dur, transcript)
|
||||
# mf.write(manifest)
|
||||
ui_dump_file = dataset_dir / Path("ui_dump.json")
|
||||
ExtendedPath(ui_dump_file).write_json({"data": dump_data})
|
||||
ExtendedPath(ui_dump_file).write_json({"data": dump_data}, verbose=verbose)
|
||||
return num_datapoints
|
||||
|
||||
|
||||
|
|
@ -254,9 +269,10 @@ def asr_manifest_reader(data_manifest_path: Path):
|
|||
yield p
|
||||
|
||||
|
||||
def asr_manifest_writer(asr_manifest_path: Path, manifest_str_source):
|
||||
def asr_manifest_writer(asr_manifest_path: Path, manifest_str_source, verbose=False):
|
||||
with asr_manifest_path.open("w") as mf:
|
||||
print(f"opening {asr_manifest_path} for writing manifest")
|
||||
if verbose:
|
||||
print(f"writing asr manifest to {asr_manifest_path}")
|
||||
for mani_dict in manifest_str_source:
|
||||
manifest = manifest_str(
|
||||
mani_dict["audio_filepath"], mani_dict["duration"], mani_dict["text"]
|
||||
|
|
@ -293,37 +309,43 @@ def batch(iterable, n=1):
|
|||
class ExtendedPath(type(Path())):
|
||||
"""docstring for ExtendedPath."""
|
||||
|
||||
def read_json(self):
|
||||
print(f"reading json from {self}")
|
||||
def read_json(self, verbose=False):
|
||||
if verbose:
|
||||
print(f"reading json from {self}")
|
||||
with self.open("r") as jf:
|
||||
return json.load(jf)
|
||||
|
||||
def read_yaml(self):
|
||||
def read_yaml(self, verbose=False):
|
||||
yaml = YAML(typ="safe", pure=True)
|
||||
print(f"reading yaml from {self}")
|
||||
if verbose:
|
||||
print(f"reading yaml from {self}")
|
||||
with self.open("r") as yf:
|
||||
return yaml.load(yf)
|
||||
|
||||
def read_jsonl(self):
|
||||
print(f"reading jsonl from {self}")
|
||||
def read_jsonl(self, verbose=False):
|
||||
if verbose:
|
||||
print(f"reading jsonl from {self}")
|
||||
with self.open("r") as jf:
|
||||
for l in jf.readlines():
|
||||
yield json.loads(l)
|
||||
for ln in jf.readlines():
|
||||
yield json.loads(ln)
|
||||
|
||||
def write_json(self, data):
|
||||
print(f"writing json to {self}")
|
||||
def write_json(self, data, verbose=False):
|
||||
if verbose:
|
||||
print(f"writing json to {self}")
|
||||
self.parent.mkdir(parents=True, exist_ok=True)
|
||||
with self.open("w") as jf:
|
||||
json.dump(data, jf, indent=2)
|
||||
|
||||
def write_yaml(self, data):
|
||||
def write_yaml(self, data, verbose=False):
|
||||
yaml = YAML()
|
||||
print(f"writing yaml to {self}")
|
||||
if verbose:
|
||||
print(f"writing yaml to {self}")
|
||||
with self.open("w") as yf:
|
||||
yaml.dump(data, yf)
|
||||
|
||||
def write_jsonl(self, data):
|
||||
print(f"writing jsonl to {self}")
|
||||
def write_jsonl(self, data, verbose=False):
|
||||
if verbose:
|
||||
print(f"writing jsonl to {self}")
|
||||
self.parent.mkdir(parents=True, exist_ok=True)
|
||||
with self.open("w") as jf:
|
||||
for d in data:
|
||||
|
|
|
|||
|
|
@ -1,12 +1,14 @@
|
|||
from pathlib import Path
|
||||
from .tts import GoogleTTS
|
||||
# from IPython import display
|
||||
import requests
|
||||
import io
|
||||
import typer
|
||||
import shutil
|
||||
|
||||
import typer
|
||||
from plume.utils import lazy_module
|
||||
|
||||
from .tts import GoogleTTS
|
||||
|
||||
display = lazy_module('IPython.display')
|
||||
pydub = lazy_module('pydub')
|
||||
|
||||
|
|
@ -63,16 +65,19 @@ def gentle_preview(
|
|||
audio_path: Path,
|
||||
transcript_path: Path,
|
||||
service_uri="http://101.53.142.218:8765/transcriptions",
|
||||
gent_preview_dir="../gentle_preview",
|
||||
gent_preview_dir="./gentle_preview",
|
||||
):
|
||||
from . import ExtendedPath
|
||||
|
||||
ab = audio_path.read_bytes()
|
||||
tt = transcript_path.read_text()
|
||||
audio, alignment = gentle_aligner(service_uri, ab, tt)
|
||||
audio.export(gent_preview_dir / Path("a.wav"), format="wav")
|
||||
alignment["status"] = "OK"
|
||||
ExtendedPath(gent_preview_dir / Path("status.json")).write_json(alignment)
|
||||
pkg_gentle_dir = Path(__file__).parent / 'gentle_preview'
|
||||
|
||||
shutil.copytree(str(pkg_gentle_dir), str(gent_preview_dir))
|
||||
# ab = audio_path.read_bytes()
|
||||
# tt = transcript_path.read_text()
|
||||
# audio, alignment = gentle_aligner(service_uri, ab, tt)
|
||||
# audio.export(gent_preview_dir / Path("a.wav"), format="wav")
|
||||
# alignment["status"] = "OK"
|
||||
# ExtendedPath(gent_preview_dir / Path("status.json")).write_json(alignment)
|
||||
|
||||
|
||||
def main():
|
||||
|
|
|
|||
|
|
@ -0,0 +1,5 @@
|
|||
Serve with https://github.com/danvk/RangeHTTPServer
|
||||
`https://github.com/claysciences/CORSRangeHTTPServer`
|
||||
|
||||
`python -m RangeHTTPServer`
|
||||
`python -m http.server`
|
||||
|
|
@ -0,0 +1,80 @@
|
|||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<style>
|
||||
body {font-family: sans-serif; padding-top: 70px; }
|
||||
textarea { width: 500px; height: 20em; }
|
||||
input, textarea { margin: 1em 0; }
|
||||
#header {
|
||||
position: fixed;
|
||||
top: 0;
|
||||
left: 0;
|
||||
height: 50px;
|
||||
line-height: 50px;
|
||||
width: 100%;
|
||||
background-color: #999;
|
||||
box-shadow: 0px 0px 5px 0px rgba(0,0,0,0.5);
|
||||
font-family: Helvetica, sans-serif;
|
||||
}
|
||||
#header, #header a {
|
||||
color: white;
|
||||
}
|
||||
.home {
|
||||
margin: 0;
|
||||
font-size: 125%;
|
||||
font-weight: lighter;
|
||||
text-transform: lowercase;
|
||||
}
|
||||
.home a {
|
||||
margin: 0;
|
||||
background: #666;
|
||||
padding-left: 25px;
|
||||
padding-right: 30px;
|
||||
margin-right: 20px;
|
||||
float: left;
|
||||
text-decoration: none;
|
||||
}
|
||||
.home:hover a {
|
||||
background: #555;
|
||||
}
|
||||
#align-button {
|
||||
background: #CCC;
|
||||
border: 0;
|
||||
font-size: 18px;
|
||||
padding: 10px 30px;
|
||||
cursor: pointer;
|
||||
}
|
||||
#alignment-flags {
|
||||
background: #CCC;
|
||||
border: 0;
|
||||
font-size: 18px;
|
||||
padding: 10px 30px;
|
||||
}
|
||||
#footer {
|
||||
margin-top: 100px;
|
||||
border-top: 1px dotted black;
|
||||
font-size: 8pt;
|
||||
font-style: italic;
|
||||
padding: 10px;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div id="header">
|
||||
<h1 class="home"><a href="/">Gentle</a></h1>
|
||||
</div>
|
||||
<form action="/transcriptions" method="POST" enctype="multipart/form-data">
|
||||
Audio:<br>
|
||||
<input type=file name=audio><br>
|
||||
<br>
|
||||
Transcript:<br>
|
||||
<textarea name="transcript"></textarea><br>
|
||||
<input id=alignment-flags name=conservative type=checkbox> Conservative<br>
|
||||
<input id=alignment-flags name=disfluency type=checkbox> Include disfluencies<br>
|
||||
<input id="align-button" type=submit value=Align>
|
||||
</form>
|
||||
<div id="footer">
|
||||
<a href="https://lowerquality.com/gentle">Gentle</a> is free software released under the <a href="https://opensource.org/licenses/MIT">MIT license</a>. <a href="https://lowerquality.com/gentle">Homepage</a> | <a href="https://github.com/lowerquality/gentle">Source code</a>.
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
|
@ -0,0 +1,408 @@
|
|||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<style>
|
||||
html, body {
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
}
|
||||
#header {
|
||||
position: fixed;
|
||||
top: 0;
|
||||
left: 0;
|
||||
height: 50px;
|
||||
line-height: 50px;
|
||||
width: 100%;
|
||||
background-color: #999;
|
||||
box-shadow: 0px 0px 5px 0px rgba(0,0,0,0.5);
|
||||
font-family: Helvetica, sans-serif;
|
||||
}
|
||||
#header, #header a {
|
||||
color: white;
|
||||
}
|
||||
#downloads {
|
||||
float: right;
|
||||
background: #999;
|
||||
}
|
||||
.download {
|
||||
float: right;
|
||||
background: #999;
|
||||
padding: 0 5px;
|
||||
}
|
||||
.home {
|
||||
margin: 0;
|
||||
font-size: 125%;
|
||||
font-weight: lighter;
|
||||
text-transform: lowercase;
|
||||
}
|
||||
.home a {
|
||||
margin: 0;
|
||||
background: #666;
|
||||
padding-left: 25px;
|
||||
padding-right: 30px;
|
||||
margin-right: 20px;
|
||||
float: left;
|
||||
text-decoration: none;
|
||||
}
|
||||
.home:hover a {
|
||||
background: #555;
|
||||
}
|
||||
#audio {
|
||||
margin-top: 9px;
|
||||
width: 50%;
|
||||
display: inline-block;
|
||||
}
|
||||
#transcript {
|
||||
margin: 0 15px;
|
||||
margin-top: 70px;
|
||||
margin-bottom: 5em;
|
||||
white-space: pre-wrap;
|
||||
line-height: 2em;
|
||||
max-width: 600px;
|
||||
color: #999;
|
||||
}
|
||||
#transcript.status {
|
||||
background-color: #333;
|
||||
color: #fff;
|
||||
font-family: Courier, mono;
|
||||
line-height: 1em;
|
||||
font-size: 10pt;
|
||||
max-width: 100%;
|
||||
}
|
||||
#transcript.status h2 {
|
||||
padding: 10px;
|
||||
}
|
||||
#transcript.status .entry {
|
||||
margin-bottom: 10px;
|
||||
padding: 10px;
|
||||
}
|
||||
#transcript.status progress {
|
||||
width: 100%;
|
||||
height: 30px;
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
.success {
|
||||
color: black;
|
||||
}
|
||||
.success:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
.active {
|
||||
color: magenta;
|
||||
}
|
||||
#preloader {
|
||||
visibility: hidden;
|
||||
}
|
||||
.phactive {
|
||||
text-decoration: underline;
|
||||
}
|
||||
.phones {
|
||||
position: absolute;
|
||||
color: #333;
|
||||
}
|
||||
.phones .phone {
|
||||
margin-right: 5px;
|
||||
font-family: Helvetica, sans-serif;
|
||||
text-transform: uppercase;
|
||||
font-size: 50%;
|
||||
}
|
||||
.phones .phone:last-child {
|
||||
margin-right: 0;
|
||||
}
|
||||
#footer {
|
||||
margin-top: 100px;
|
||||
border-top: 1px dotted black;
|
||||
font-size: 8pt;
|
||||
font-style: italic;
|
||||
font-family: Helvetica, sans-serif;
|
||||
padding: 10px;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div id="header">
|
||||
<!-- <h1 class="home"><a href="/">Gentle</a></h1> -->
|
||||
<audio id="audio" src="a.wav" controls="true" preload="auto"></audio>
|
||||
<img src="/preloader.gif" id="preloader" alt="loading...">
|
||||
<span id="downloads"> </div>
|
||||
</div>
|
||||
<div id="transcript"></div>
|
||||
<!-- <div id="footer">
|
||||
<a href="https://lowerquality.com/gentle">Gentle</a> is free software released under the <a href="https://opensource.org/licenses/MIT">MIT license</a>. <a href="https://lowerquality.com/gentle">Homepage</a> | <a href="https://github.com/lowerquality/gentle">Source code</a>.
|
||||
</div> -->
|
||||
|
||||
<script>
|
||||
|
||||
function get(url, cb) {
|
||||
var xhr = new XMLHttpRequest();
|
||||
xhr.open("GET", url, true);
|
||||
xhr.onload = function() {
|
||||
cb(this.responseText);
|
||||
}
|
||||
xhr.send();
|
||||
}
|
||||
function get_json(url, cb) {
|
||||
get(url, function(x) {
|
||||
cb(JSON.parse(x));
|
||||
});
|
||||
}
|
||||
|
||||
var $a = document.getElementById("audio");
|
||||
window.onkeydown = function(ev) {
|
||||
if(ev.keyCode == 32) {
|
||||
ev.preventDefault();
|
||||
$a.pause();
|
||||
}
|
||||
}
|
||||
|
||||
var $trans = document.getElementById("transcript");
|
||||
var $preloader = document.getElementById('preloader');
|
||||
|
||||
var wds = [];
|
||||
var cur_wd;
|
||||
|
||||
var $phones = document.createElement("div");
|
||||
$phones.className = "phones";
|
||||
document.body.appendChild($phones);
|
||||
|
||||
var cur_phones$ = []; // List of phoneme $divs
|
||||
var $active_phone;
|
||||
|
||||
function render_phones(wd) {
|
||||
cur_phones$ = [];
|
||||
$phones.innerHTML = "";
|
||||
$active_phone = null;
|
||||
|
||||
$phones.style.top = wd.$div.offsetTop + 18;
|
||||
$phones.style.left = wd.$div.offsetLeft;
|
||||
|
||||
var dur = wd.end - wd.start;
|
||||
|
||||
var start_x = wd.$div.offsetLeft;
|
||||
|
||||
wd.phones
|
||||
.forEach(function(ph){
|
||||
var $p = document.createElement("span");
|
||||
$p.className = "phone";
|
||||
$p.textContent = ph.phone.split("_")[0];
|
||||
|
||||
$phones.appendChild($p);
|
||||
cur_phones$.push($p);
|
||||
});
|
||||
|
||||
var offsetToCenter = (wd.$div.offsetWidth - $phones.offsetWidth) / 2;
|
||||
$phones.style.left = wd.$div.offsetLeft + offsetToCenter;
|
||||
}
|
||||
function highlight_phone(t) {
|
||||
if(!cur_wd) {
|
||||
$phones.innerHTML = "";
|
||||
return;
|
||||
}
|
||||
var hit;
|
||||
var cur_t = cur_wd.start;
|
||||
|
||||
cur_wd.phones.forEach(function(ph, idx) {
|
||||
if(cur_t <= t && cur_t + ph.duration >= t) {
|
||||
hit = idx;
|
||||
}
|
||||
cur_t += ph.duration;
|
||||
});
|
||||
|
||||
if(hit) {
|
||||
var $ph = cur_phones$[hit];
|
||||
if($ph != $active_phone) {
|
||||
if($active_phone) {
|
||||
$active_phone.classList.remove("phactive");
|
||||
}
|
||||
if($ph) {
|
||||
$ph.classList.add("phactive");
|
||||
}
|
||||
}
|
||||
$active_phone = $ph;
|
||||
}
|
||||
}
|
||||
|
||||
function highlight_word() {
|
||||
var t = $a.currentTime;
|
||||
// XXX: O(N); use binary search
|
||||
var hits = wds.filter(function(x) {
|
||||
return (t - x.start) > 0.01 && (x.end - t) > 0.01;
|
||||
}, wds);
|
||||
var next_wd = hits[hits.length - 1];
|
||||
|
||||
if(cur_wd != next_wd) {
|
||||
var active = document.querySelectorAll('.active');
|
||||
for(var i = 0; i < active.length; i++) {
|
||||
active[i].classList.remove('active');
|
||||
}
|
||||
if(next_wd && next_wd.$div) {
|
||||
next_wd.$div.classList.add('active');
|
||||
render_phones(next_wd);
|
||||
}
|
||||
}
|
||||
cur_wd = next_wd;
|
||||
highlight_phone(t);
|
||||
|
||||
window.requestAnimationFrame(highlight_word);
|
||||
}
|
||||
window.requestAnimationFrame(highlight_word);
|
||||
|
||||
$trans.innerHTML = "Loading...";
|
||||
|
||||
function render(ret) {
|
||||
wds = ret['words'] || [];
|
||||
transcript = ret['transcript'];
|
||||
|
||||
$trans.innerHTML = '';
|
||||
|
||||
var currentOffset = 0;
|
||||
|
||||
wds.forEach(function(wd) {
|
||||
if(wd.case == 'not-found-in-transcript') {
|
||||
// TODO: show phonemes somewhere
|
||||
var txt = ' ' + wd.word;
|
||||
var $plaintext = document.createTextNode(txt);
|
||||
$trans.appendChild($plaintext);
|
||||
return;
|
||||
}
|
||||
|
||||
// Add non-linked text
|
||||
if(wd.startOffset > currentOffset) {
|
||||
var txt = transcript.slice(currentOffset, wd.startOffset);
|
||||
var $plaintext = document.createTextNode(txt);
|
||||
$trans.appendChild($plaintext);
|
||||
currentOffset = wd.startOffset;
|
||||
}
|
||||
|
||||
var $wd = document.createElement('span');
|
||||
var txt = transcript.slice(wd.startOffset, wd.endOffset);
|
||||
var $wdText = document.createTextNode(txt);
|
||||
$wd.appendChild($wdText);
|
||||
wd.$div = $wd;
|
||||
if(wd.start !== undefined) {
|
||||
$wd.className = 'success';
|
||||
}
|
||||
$wd.onclick = function() {
|
||||
if(wd.start !== undefined) {
|
||||
console.log(wd.start);
|
||||
$a.currentTime = wd.start;
|
||||
$a.play();
|
||||
}
|
||||
};
|
||||
$trans.appendChild($wd);
|
||||
currentOffset = wd.endOffset;
|
||||
});
|
||||
|
||||
var txt = transcript.slice(currentOffset, transcript.length);
|
||||
var $plaintext = document.createTextNode(txt);
|
||||
$trans.appendChild($plaintext);
|
||||
currentOffset = transcript.length;
|
||||
}
|
||||
|
||||
function show_downloads() {
|
||||
var $d = document.getElementById("downloads");
|
||||
$d.textContent = "Download as: ";
|
||||
var uid = window.location.pathname.split("/")[2];
|
||||
// Name, path, title, inhibit-on-file:///
|
||||
[["CSV", "align.csv", "Word alignment CSV"],
|
||||
["JSON", "align.json", "JSON word/phoneme alignment data"],
|
||||
["Zip", "/zip/" + uid + ".zip", "Standalone zipfile", true]]
|
||||
.forEach(function(x) {
|
||||
var $a = document.createElement("a");
|
||||
$a.className = "download";
|
||||
$a.textContent = x[0];
|
||||
$a.href = x[1];
|
||||
$a.title = x[2];
|
||||
if(!x[3] || window.location.protocol != "file:") {
|
||||
$d.appendChild($a);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
var status_init = false;
|
||||
var status_log = []; // [ status ]
|
||||
var $status_pro;
|
||||
|
||||
// Render one transcription-status update `ret` ({status, percent, message})
// into the $trans div, newest entry on top, driving a <progress> bar.
function render_status(ret) {
    if(!status_init) {
        // First call: clobber the $trans div and use it for status updates
        $trans.innerHTML = "<h2>transcription in progress</h2>";
        $trans.className = "status";
        $status_pro = document.createElement("progress");
        $status_pro.setAttribute("min", "0");
        $status_pro.setAttribute("max", "100");
        $status_pro.value = 0;
        $trans.appendChild($status_pro);

        status_init = true;
    }
    if(ret.status !== "TRANSCRIBING") {
        // Non-transcribing phases only move the progress bar; no log entry.
        if(ret.percent) {
            $status_pro.value = (100*ret.percent);
        }
    }
    else if(ret.percent && (status_log.length == 0 || status_log[status_log.length-1].percent+0.0001 < ret.percent)) {
        // New entry — only when percent actually advanced (epsilon guard
        // avoids duplicate entries for repeated polls at the same percent).
        var $entry = document.createElement("div");
        $entry.className = "entry";
        $entry.textContent = ret.message;
        // Remember the DOM node on the status object so later entries can be
        // inserted before it (newest first).
        ret.$div = $entry;

        if(ret.percent) {
            $status_pro.value = (100*ret.percent);
        }

        if(status_log.length > 0) {
            $trans.insertBefore($entry, status_log[status_log.length-1].$div);
        }
        else {
            $trans.appendChild($entry);
        }
        status_log.push(ret);
    }
}
|
||||
|
||||
// Poll status.json and dispatch on the job status: render the finished
// alignment, show progress, or re-schedule itself while work is in flight.
function update() {
    if(INLINE_JSON) {
        // We want this to work from file:/// domains, so we provide a
        // mechanism for inlining the alignment data.
        render(INLINE_JSON);
        // show_downloads();
    }
    else {
        // Show the status
        get_json('status.json', function(ret) {
            $a.style.visibility = 'hidden';
            if (ret.status == 'ERROR') {
                $preloader.style.visibility = 'hidden';
                $trans.innerHTML = '<b>' + ret.status + ': ' + ret.error + '</b>';
            } else if (ret.status == 'TRANSCRIBING' || ret.status == 'ALIGNING') {
                // Work in progress: show spinner + status log, poll again in 2s.
                $preloader.style.visibility = 'visible';
                render_status(ret);
                setTimeout(update, 2000);
            } else if (ret.status == 'OK') {
                // show_downloads();
                $preloader.style.visibility = 'hidden';
                // XXX: should we fetch the align.json?
                // window.location.reload();
                $a.style.visibility = 'visible';
                render(ret);
            } else if (ret.status == 'ENCODING' || ret.status == 'STARTED') {
                $preloader.style.visibility = 'visible';
                $trans.innerHTML = 'Encoding, please wait...';
                setTimeout(update, 2000);
            } else {
                // Unknown status: keep polling, but back off to 5s.
                console.log("unknown status", ret);
                $preloader.style.visibility = 'hidden';
                $trans.innerHTML = ret.status + '...';
                setTimeout(update, 5000);
            }
        });
    }
}
|
||||
|
||||
var INLINE_JSON;
|
||||
|
||||
update();
|
||||
|
||||
</script></body></html>
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 2.7 KiB |
|
|
@ -8,12 +8,11 @@ import typer
|
|||
# import rpyc
|
||||
|
||||
# from tqdm import tqdm
|
||||
# from pydub import AudioSegment
|
||||
# from pydub.silence import split_on_silence
|
||||
from plume.utils import lazy_module, lazy_callable
|
||||
|
||||
rpyc = lazy_module('rpyc')
|
||||
AudioSegment = lazy_callable('pydub.AudioSegment')
|
||||
pydub = lazy_module('pydub')
|
||||
split_on_silence = lazy_callable('pydub.silence.split_on_silence')
|
||||
|
||||
app = typer.Typer()
|
||||
|
|
@ -106,7 +105,7 @@ def triton_transcribe_grpc_gen(
|
|||
# ]
|
||||
# pass
|
||||
transcript_list = []
|
||||
sil_pad = AudioSegment.silent(duration=sil_msec)
|
||||
sil_pad = pydub.AudioSegment.silent(duration=sil_msec)
|
||||
for seg in chunks:
|
||||
t_seg = sil_pad + seg + sil_pad
|
||||
c_transcript = transcriber(t_seg)
|
||||
|
|
@ -124,9 +123,7 @@ def triton_transcribe_grpc_gen(
|
|||
|
||||
@app.command()
|
||||
def file(audio_file: Path, write_file: bool = False, chunked=True):
|
||||
from pydub import AudioSegment
|
||||
|
||||
aseg = AudioSegment.from_file(audio_file)
|
||||
aseg = pydub.AudioSegment.from_file(audio_file)
|
||||
transcriber, prep = triton_transcribe_grpc_gen()
|
||||
transcription = transcriber(prep(aseg))
|
||||
|
||||
|
|
@ -139,10 +136,8 @@ def file(audio_file: Path, write_file: bool = False, chunked=True):
|
|||
|
||||
@app.command()
|
||||
def benchmark(audio_file: Path):
|
||||
from pydub import AudioSegment
|
||||
|
||||
transcriber, audio_prep = transcribe_rpyc_gen()
|
||||
file_seg = AudioSegment.from_file(audio_file)
|
||||
file_seg = pydub.AudioSegment.from_file(audio_file)
|
||||
aud_seg = audio_prep(file_seg)
|
||||
|
||||
def timeinfo():
|
||||
|
|
|
|||
|
|
@ -27,6 +27,10 @@ class GoogleTTS(object):
|
|||
audio_encoding=texttospeech.enums.AudioEncoding.LINEAR16,
|
||||
sample_rate_hertz=params["sample_rate"],
|
||||
)
|
||||
if 'speaking_rate' in params:
|
||||
audio_config.speaking_rate = params['speaking_rate']
|
||||
if 'pitch' in params:
|
||||
audio_config.pitch = params['pitch']
|
||||
response = self.client.synthesize_speech(tts_input, voice, audio_config)
|
||||
audio_content = response.audio_content
|
||||
return audio_content
|
||||
|
|
@ -74,6 +78,19 @@ class GoogleTTS(object):
|
|||
)
|
||||
return results
|
||||
|
||||
@classmethod
|
||||
def voice_by_name(cls, name):
|
||||
"""Lists the available voices."""
|
||||
|
||||
# client = cls().client
|
||||
|
||||
# Performs the list voices request
|
||||
results = cls.voice_list()
|
||||
for voice in results:
|
||||
if voice['name'] == name:
|
||||
return voice
|
||||
raise ValueError(f'{name} not a valid voice')
|
||||
|
||||
|
||||
@app.command()
|
||||
def generate_audio_file(text, dest_path: Path = "./tts_audio.wav", voice="en-US-Wavenet-D"):
|
||||
|
|
|
|||
|
|
@ -0,0 +1,85 @@
|
|||
from plume.utils import ExtendedPath, get_mongo_conn
|
||||
from plume.utils.st_rerun import rerun
|
||||
from uuid import uuid4
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def setup_file_state(st):
    """Attach file-backed cursor state helpers to the streamlit module *st*.

    The cursor is persisted as JSON in a local ``preview.lck`` file so it
    survives streamlit reruns. Installs ``st.get_current_cursor`` and
    ``st.update_cursor`` (which writes the value and triggers a rerun),
    marks setup done via ``st.state_lock``, and initializes the cursor to 0.
    """
    if hasattr(st, "state_lock"):
        return  # already initialized for this session

    lock_path = ExtendedPath("preview.lck")

    def _read_cursor():
        return lock_path.read_json()["current_cursor"]

    def _write_cursor(val=0):
        lock_path.write_json({"current_cursor": val})
        rerun()

    st.get_current_cursor = _read_cursor
    st.update_cursor = _write_cursor
    st.state_lock = True
    _write_cursor(0)
|
||||
|
||||
|
||||
def setup_mongo_asr_validation_state(st):
    """Attach mongo-backed ASR-validation state helpers to the streamlit module *st*.

    Idempotent per session (guarded by ``st.mongo_connected``). Installs
    closures on *st* for reading/updating a per-task cursor and correction
    entries in the ``asr_validation`` collection, and seeds the cursor to 0
    when no cursor document exists yet for this task.
    """
    if not hasattr(st, "mongo_connected"):
        st.mongoclient = get_mongo_conn(col="asr_validation")
        mongo_conn = st.mongoclient
        # Fresh random task id; can be overridden later via set_task_fn.
        st.task_id = str(uuid4())

        def current_cursor_fn():
            # Read the cursor document for the current task.
            # NOTE(review): raises TypeError if no cursor doc exists yet
            # (find_one returns None) — callers appear to rely on the seed
            # write at the bottom of this function having run first.
            # mongo_conn = st.mongoclient
            cursor_obj = mongo_conn.find_one(
                {"type": "current_cursor", "task_id": st.task_id}
            )
            cursor_val = cursor_obj["cursor"]
            return cursor_val

        def update_cursor_fn(val=0):
            # Upsert the cursor value, then force a streamlit rerun so the
            # UI picks up the new position.
            mongo_conn.find_one_and_update(
                {"type": "current_cursor", "task_id": st.task_id},
                {
                    "$set": {
                        "type": "current_cursor",
                        "task_id": st.task_id,
                        "cursor": val,
                    }
                },
                upsert=True,
            )
            rerun()

        def get_correction_entry_fn(code):
            # Fetch a correction document by code; _id excluded for easy
            # serialization.
            return mongo_conn.find_one(
                {"type": "correction", "code": code}, projection={"_id": False}
            )

        def update_entry_fn(code, value):
            # Upsert the corrected value, tagged with the current task id.
            mongo_conn.find_one_and_update(
                {"type": "correction", "code": code},
                {"$set": {"value": value, "task_id": st.task_id}},
                upsert=True,
            )

        def set_task_fn(data_path, task_id):
            # Optionally adopt an externally supplied task id, and create a
            # lock file marking the task under data_path.
            if task_id:
                st.task_id = task_id
            task_path = data_path / Path(f"task-{st.task_id}.lck")
            if not task_path.exists():
                print(f"creating task lock at {task_path}")
                task_path.touch()

        st.get_current_cursor = current_cursor_fn
        st.update_cursor = update_cursor_fn
        st.get_correction_entry = get_correction_entry_fn
        st.update_entry = update_entry_fn
        st.set_task = set_task_fn
        st.mongo_connected = True
        # Seed the cursor document on first setup so current_cursor_fn
        # has something to read.
        cursor_obj = mongo_conn.find_one(
            {"type": "current_cursor", "task_id": st.task_id}
        )
        if not cursor_obj:
            update_cursor_fn(0)
|
||||
|
|
@ -0,0 +1,205 @@
|
|||
import logging
|
||||
import asyncio
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import webrtcvad
|
||||
import pydub
|
||||
from pydub.playback import play
|
||||
from pydub.utils import make_chunks
|
||||
|
||||
|
||||
DEFAULT_CHUNK_DUR = 20
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def is_frame_voice(vad, seg, chunk_dur):
    """Return True when *seg* is a full-length chunk classified as speech.

    Args:
        vad: a ``webrtcvad.Vad``-like object exposing ``is_speech(raw, rate)``.
        seg: a pydub-like segment exposing ``duration_seconds``, ``raw_data``
            and ``frame_rate``.
        chunk_dur: expected chunk duration in milliseconds. Partial (tail)
            chunks never count as voice, and the VAD is not consulted for
            them (short-circuit).

    Returns:
        bool: whether the chunk is a complete frame containing speech.
    """
    # `True if (...) else False` in the original was redundant — the
    # expression already evaluates to a bool; bool() keeps that guarantee.
    return bool(
        seg.duration_seconds == chunk_dur / 1000
        and vad.is_speech(seg.raw_data, seg.frame_rate)
    )
|
||||
|
||||
|
||||
class VADFilterAudio(object):
    """Strip non-speech audio from a segment using WebRTC VAD."""

    def __init__(self, chunk_dur=DEFAULT_CHUNK_DUR):
        super(VADFilterAudio, self).__init__()
        self.chunk_dur = chunk_dur
        self.vad = webrtcvad.Vad()

    def filter_segment(self, wav_seg):
        """Return a new AudioSegment containing only the voiced chunks of *wav_seg*."""
        voiced_raw = []
        # The final chunk is usually shorter than chunk_dur, so it is skipped.
        for chunk in make_chunks(wav_seg, self.chunk_dur)[:-1]:
            if is_frame_voice(self.vad, chunk, self.chunk_dur):
                voiced_raw.append(chunk.raw_data)
        return pydub.AudioSegment(
            data=b"".join(voiced_raw),
            frame_rate=wav_seg.frame_rate,
            channels=wav_seg.channels,
            sample_width=wav_seg.sample_width,
        )
|
||||
|
||||
|
||||
class VADUtterance(object):
    """Segment a chunked audio stream into utterances using WebRTC VAD.

    Consumes fixed-duration pydub chunks from an async stream and yields
    voiced buffers bounded by silence gaps (``max_silence``), a minimum
    utterance length (``min_utterance``) and a hard maximum
    (``max_utterance``). All durations are in milliseconds.
    """

    def __init__(
        self,
        max_silence=500,
        min_utterance=280,
        max_utterance=20000,
        chunk_dur=DEFAULT_CHUNK_DUR,
        start_cycles=3,
    ):
        super(VADUtterance, self).__init__()
        self.vad = webrtcvad.Vad()
        self.chunk_dur = chunk_dur
        # duration in millisecs
        self.max_sil = max_silence
        self.min_utt = min_utterance
        self.max_utt = max_utterance
        # Voice must persist for start_cycles chunks before a
        # "started speaking" event fires (stream_events only).
        self.speech_start = start_cycles * chunk_dur

    def __repr__(self):
        return f"VAD(max_silence={self.max_sil},min_utterance:{self.min_utt},max_utterance:{self.max_utt})"

    async def stream_utterance(self, audio_stream):
        """Async-yield voiced AudioSegments (utterances) from *audio_stream*.

        *audio_stream* is an async iterable of pydub chunks of
        ``self.chunk_dur`` milliseconds each.
        """
        silence_buffer = pydub.AudioSegment.empty()
        voice_buffer = pydub.AudioSegment.empty()
        silence_threshold = False
        async for c in audio_stream:
            voice_frame = is_frame_voice(self.vad, c, self.chunk_dur)
            logger.debug(f"is audio stream voice? {voice_frame}")
            if voice_frame:
                # Voice resets the silence run.
                silence_threshold = False
                voice_buffer += c
                silence_buffer = pydub.AudioSegment.empty()
            else:
                silence_buffer += c
            voc_dur = voice_buffer.duration_seconds * 1000
            sil_dur = silence_buffer.duration_seconds * 1000

            # Hard cap: flush an over-long utterance even without silence.
            if voc_dur >= self.max_utt:
                logger.info(
                    f"detected voice overflow: voice duration {voice_buffer.duration_seconds}"
                )
                yield voice_buffer
                voice_buffer = pydub.AudioSegment.empty()

            # Silence gap ends the current utterance, if it is long enough.
            if sil_dur >= self.max_sil:
                if voc_dur >= self.min_utt:
                    logger.info(
                        f"detected silence: voice duration {voice_buffer.duration_seconds}"
                    )
                    yield voice_buffer
                    voice_buffer = pydub.AudioSegment.empty()
                # ignore/clear voice if silence reached threshold or indent the statement
                # NOTE(review): a sub-min_utt voice buffer is NOT cleared here,
                # so short noise bursts can accumulate across silences — confirm
                # whether that is intended.
                if not silence_threshold:
                    silence_threshold = True

        # Flush whatever voice remains when the stream ends.
        if voice_buffer:
            yield voice_buffer

    async def stream_events(self, audio_stream):
        """
        Event-stream variant of stream_utterance.

        yields 0, voice_buffer for SpeechBuffer
        yields 1, None for StartedSpeaking
        yields 2, None for StoppedSpeaking
        yields 4, audio_stream
        """
        silence_buffer = pydub.AudioSegment.empty()
        voice_buffer = pydub.AudioSegment.empty()
        silence_threshold, started_speaking = False, False
        async for c in audio_stream:
            # yield (4, c)
            voice_frame = is_frame_voice(self.vad, c, self.chunk_dur)
            logger.debug(f"is audio stream voice? {voice_frame}")
            if voice_frame:
                silence_threshold = False
                voice_buffer += c
                silence_buffer = pydub.AudioSegment.empty()
            else:
                silence_buffer += c
            voc_dur = voice_buffer.duration_seconds * 1000
            sil_dur = silence_buffer.duration_seconds * 1000

            # Fire StartedSpeaking once sustained voice is observed.
            if voc_dur >= self.speech_start and not started_speaking:
                started_speaking = True
                yield (1, None)

            # Hard cap: flush an over-long utterance even without silence.
            if voc_dur >= self.max_utt:
                logger.info(
                    f"detected voice overflow: voice duration {voice_buffer.duration_seconds}"
                )
                yield (0, voice_buffer)
                voice_buffer = pydub.AudioSegment.empty()
                started_speaking = False

            if sil_dur >= self.max_sil:
                if voc_dur >= self.min_utt:
                    logger.info(
                        f"detected silence: voice duration {voice_buffer.duration_seconds}"
                    )
                    yield (0, voice_buffer)
                    voice_buffer = pydub.AudioSegment.empty()
                    started_speaking = False
                # ignore/clear voice if silence reached threshold or indent the statement
                # StoppedSpeaking fires only once per silence run.
                if not silence_threshold:
                    silence_threshold = True
                    yield (2, None)

        # Flush whatever voice remains when the stream ends.
        if voice_buffer:
            yield (0, voice_buffer)

    @classmethod
    async def stream_utterance_file(cls, audio_file):
        """Demo/driver: play back each detected utterance from *audio_file*."""

        async def stream_gen():
            # Resample and chop the file into fixed-duration chunks.
            audio_seg = pydub.AudioSegment.from_file(audio_file).set_frame_rate(32000)
            chunks = make_chunks(audio_seg, DEFAULT_CHUNK_DUR)
            for c in chunks:
                yield c

        va_ut = cls()
        buffer_src = va_ut.stream_utterance(stream_gen())
        async for buf in buffer_src:
            play(buf)
            await asyncio.sleep(1)
||||
|
||||
|
||||
class VADStreamGen(object):
    """Placeholder for a VAD-driven stream generator.

    NOTE(review): unimplemented stub — only stores its constructor argument;
    presumably intended to wrap a live audio source, confirm before use.
    """

    def __init__(self, arg):
        super(VADStreamGen, self).__init__()
        self.arg = arg
|
||||
|
||||
|
||||
def main():
    """CLI entry point: play back VAD-detected utterances from an audio file."""
    prog = Path(__file__).stem
    arg_parser = argparse.ArgumentParser(prog=prog, description="transcribes audio file")
    arg_parser.add_argument(
        "--audio_file",
        type=argparse.FileType("rb"),
        help="audio file to transcribe",
        default="./test_utter2.wav",
    )
    parsed_args = arg_parser.parse_args()
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(
        VADUtterance.stream_utterance_file(parsed_args.audio_file)
    )


if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in New Issue