1
0
mirror of https://github.com/malarinv/plume-asr.git synced 2026-03-07 20:02:34 +00:00

1. refactor package root to src/ layout

2. add framework suffix for models
3. change black max columns to 79
4. add tests
5. integrate vad, encrypt and refactor manifest, regentity, extended_path, audio, parallel utils
6. added ui utils for encrypted preview
7. wip marblenet model
8. added transformers based wav2vec2 inference
9. update readme and manifest
10. add deploy setup target
This commit is contained in:
2021-06-02 18:47:44 +05:30
parent c474aa5f5a
commit e07c7c9caf
76 changed files with 4096 additions and 474 deletions

View File

@@ -0,0 +1,317 @@
import re
def entity_replacer_keeper(pre_rules=(), entity_rules=(), post_rules=()):
    """Build a ``(replacer, keep_literals)`` pair from regex rewrite rules.

    Every rule is a ``(pattern, replacement)`` pair in the form accepted by
    ``re.sub``. Patterns in ``entity_rules`` are compiled with
    ``re.IGNORECASE``; ``pre_rules`` and ``post_rules`` are case-sensitive.

    Args:
        pre_rules: Rules applied first, to clean up the raw text.
        entity_rules: Rules that recognise (and, in ``replacer``, rewrite)
            the entities of interest.
        post_rules: Rules applied last, to the result of the earlier steps.

        Each argument may be any iterable of pairs, or ``None`` / omitted
        for "no rules". The defaults are immutable tuples so that no
        mutable object is shared between calls.

    Returns:
        A tuple ``(replacer, keep_literals)`` of functions:

        * ``replacer(text)`` applies the pre, entity and post rules in that
          order and returns the rewritten string.
        * ``keep_literals(text)`` applies the pre rules, locates every
          entity-rule match *without* substituting it, merges overlapping
          or touching match spans, joins the matched substrings with single
          spaces, applies the post rules to that string and returns
          ``(kept_text, number_of_merged_spans)``.
    """
    pre_rules_c = [(re.compile(k), v) for (k, v) in (pre_rules or ())]
    entity_rules_c = [
        (re.compile(k, re.IGNORECASE), v) for (k, v) in (entity_rules or ())
    ]
    post_rules_c = [(re.compile(k), v) for (k, v) in (post_rules or ())]
    re_rules = pre_rules_c + entity_rules_c + post_rules_c

    def replacer(w2v_out):
        """Apply every compiled rule, in order, to ``w2v_out``."""
        out = w2v_out
        for (pattern, replacement) in re_rules:
            out = pattern.sub(replacement, out)
        return out

    def merge_intervals(intervals):
        """Merge overlapping or touching ``(start, end)`` intervals.

        Adapted from https://codereview.stackexchange.com/a/69249.
        An interval tree (https://www.geeksforgeeks.org/interval-tree/)
        would be the asymptotically better structure for this.
        """
        merged = []
        for higher in sorted(intervals, key=lambda tup: tup[0]):
            if not merged:
                merged.append(higher)
                continue
            lower = merged[-1]
            # Sorting guarantees lower[0] <= higher[0], so the two
            # intersect (or touch) exactly when higher starts before
            # lower ends.
            if higher[0] <= lower[1]:
                merged[-1] = (lower[0], max(lower[1], higher[1]))
            else:
                merged.append(higher)
        return merged

    def keep_literals(w2v_out):
        """Return the matched entity substrings and how many there are."""
        out = w2v_out
        for (pattern, replacement) in pre_rules_c:
            out = pattern.sub(replacement, out)
        spans = []
        for (pattern, _replacement) in entity_rules_c:
            spans.extend(m.span() for m in pattern.finditer(out))
        merged = merge_intervals(spans)
        keep_out = " ".join(out[start:end] for (start, end) in merged)
        for (pattern, replacement) in post_rules_c:
            keep_out = pattern.sub(replacement, keep_out)
        return keep_out, len(merged)

    return replacer, keep_literals
def default_num_only_rules(num_range):
    """Return regex rules mapping spelled-out numbers 0-99 to digit strings.

    The list is ordered from ``ninety-nine`` down to ``zero`` so that
    compound words such as ``fifty-eight`` are tried before their parts,
    followed by identity rules for the single digits ``9`` .. ``0`` and a
    final rule turning ``hundred`` into ``"00"``.

    Args:
        num_range: Accepted for interface compatibility; this function
            does not read it and always emits the full 0-99 table.

    Returns:
        A list of ``(pattern, replacement)`` string pairs.
    """
    units = [
        "zero", "one", "two", "three", "four",
        "five", "six", "seven", "eight", "nine",
    ]
    teens = [
        "ten", "eleven", "twelve", "thirteen", "fourteen",
        "fifteen", "sixteen", "seventeen", "eighteen", "nineteen",
    ]
    decades = [
        "twenty", "thirty", "forty", "fifty",
        "sixty", "seventy", "eighty", "ninety",
    ]

    # words[n] is the spelling of n for every n in 0..99.
    words = units + teens
    for decade in decades:
        words.append(decade)
        words.extend(decade + "-" + unit for unit in units[1:])

    rules = []
    for value in range(len(words) - 1, -1, -1):
        rules.append((r"\b" + words[value] + r"\b", str(value)))
    for digit in range(9, -1, -1):
        rules.append((r"\b" + str(digit) + r"\b", str(digit)))
    rules.append((r"\bhundred\b", "00"))
    return rules
def default_num_rules(num_range):
    """Return the number rules plus spoken-digit conveniences.

    Extends ``default_num_only_rules(num_range)`` with rules that turn
    ``oh`` / ``o`` into a space-padded ``0`` and expand
    ``double X`` / ``triple X`` (separated by a space or hyphen) into two
    or three space-separated copies of ``X``.

    Args:
        num_range: Forwarded unchanged to ``default_num_only_rules``.

    Returns:
        A new list of ``(pattern, replacement)`` pairs.
    """
    rules = list(default_num_only_rules(num_range))
    rules.append((r"\boh\b", " 0 "))
    rules.append((r"\bo\b", " 0 "))
    rules.append((r"\bdouble(?: |-)(\w+|\d+)\b", "\\1 \\1"))
    rules.append((r"\btriple(?: |-)(\w+|\d+)\b", "\\1 \\1 \\1"))
    return rules
def default_alnum_rules(num_range, oh_is_zero):
    """Return entity rules for mixed letter/number (alphanumeric) text.

    The result is ``default_num_only_rules(num_range)`` followed by the
    handling of ``oh``, the ``double`` / ``triple`` expansions, and a rule
    matching any single ASCII letter standing alone (replaced by itself,
    so the substitution is a no-op that still registers as a match).

    Args:
        num_range: Forwarded unchanged to ``default_num_only_rules``.
        oh_is_zero: When truthy, ``oh`` and ``o`` both become ``"0"``;
            otherwise only ``oh`` is rewritten, to the letter ``"o"``.

    Returns:
        A new list of ``(pattern, replacement)`` pairs.
    """
    if oh_is_zero:
        oh_rules = [
            (r"\boh\b", "0"),
            (r"\bo\b", "0"),
        ]
    else:
        oh_rules = [(r"\boh\b", "o")]

    repeat_and_letter_rules = [
        (r"\bdouble(?: |-)(\w+|\d+)\b", "\\1 \\1"),
        (r"\btriple(?: |-)(\w+|\d+)\b", "\\1 \\1 \\1"),
        (r"\b([a-zA-Z])\b", "\\1"),
    ]
    number_rules = default_num_only_rules(num_range)
    return number_rules + oh_rules + repeat_and_letter_rules
def num_replacer(num_range=100, condense=True):
    """Return a function that rewrites spoken numbers in text as digits.

    Args:
        num_range: Forwarded to ``default_num_rules``.
        condense: When truthy, a final rule strips every character that is
            not ``0``-``9`` so only the digits remain; when falsy, the
            surrounding text is left in place.

    Returns:
        The ``replacer`` callable produced by ``entity_replacer_keeper``.
    """
    strip_non_digits = []
    if condense:
        strip_non_digits.append((r"[^0-9]", ""))
    replace, _keep = entity_replacer_keeper(
        entity_rules=default_num_rules(num_range),
        post_rules=strip_non_digits,
    )
    return replace
def num_keeper(num_range=100):
    """Return a function that keeps only the number-like parts of text.

    Before matching, each space, ``;``, ``,`` or ``.`` is replaced by a
    space. No post rules are applied.

    Args:
        num_range: Forwarded to ``default_num_rules``.

    Returns:
        The ``keep_literals`` callable produced by
        ``entity_replacer_keeper``.
    """
    separators_to_space = [(r"[ ;,.]", " ")]
    _replace, keep = entity_replacer_keeper(
        pre_rules=separators_to_space,
        entity_rules=default_num_rules(num_range),
        post_rules=[],
    )
    return keep
def alnum_replacer(num_range=100, oh_is_zero=False, condense=True):
    """Return a function that rewrites spoken alphanumerics in text.

    Pre rules replace each space, ``;``, ``,`` or ``.`` with a space and
    delete apostrophes. Entity rules come from ``default_alnum_rules``.

    Args:
        num_range: Forwarded to ``default_alnum_rules``.
        oh_is_zero: Forwarded to ``default_alnum_rules``.
        condense: When truthy, three post rules run in order: delete every
            run of two or more ASCII letters, delete every remaining
            non-alphanumeric character, then upper-case everything from
            the first lowercase letter to the end of the string. When
            falsy, no post rules are applied.

    Returns:
        The ``replacer`` callable produced by ``entity_replacer_keeper``.
    """

    def to_upper(match):
        # Replacement callback: upper-case the whole matched text.
        return match.group(0).upper()

    cleanup_rules = [(r"[ ;,.]", " "), (r"[']", "")]
    if condense:
        condense_rules = [
            (r"\b[a-zA-Z]{2,}\b", ""),
            (r"[^a-zA-Z0-9]", ""),
            (r"([a-z].*)", to_upper),
        ]
    else:
        condense_rules = []

    replace, _keep = entity_replacer_keeper(
        pre_rules=cleanup_rules,
        entity_rules=default_alnum_rules(num_range, oh_is_zero),
        post_rules=condense_rules,
    )
    return replace
def alnum_keeper(num_range=100, oh_is_zero=False):
    """Return a function that keeps only the alphanumeric entities of text.

    Pre rules replace each space, ``;``, ``,`` or ``.`` with a space and
    delete apostrophes. No post rules are applied.

    Args:
        num_range: Forwarded to ``default_alnum_rules``.
        oh_is_zero: Forwarded to ``default_alnum_rules``.

    Returns:
        The ``keep_literals`` callable produced by
        ``entity_replacer_keeper``.
    """
    cleanup_rules = [(r"[ ;,.]", " "), (r"[']", "")]
    _replace, keep = entity_replacer_keeper(
        pre_rules=cleanup_rules,
        entity_rules=default_alnum_rules(num_range, oh_is_zero),
        post_rules=[],
    )
    return keep
def test_num():
    """Check the number replacer (condensed and not) and the keeper."""
    condensed = num_replacer()
    uncondensed = num_replacer(condense=False)
    keep = num_keeper()

    # Condensed output: only the digits survive.
    condensed_cases = [
        ("thirty-two", "32"),
        ("not thirty-two fifty-nine", "3259"),
        (" triPle 5 fifty 3", "555503"),
        ("douBle 2 130", "22130"),
        ("It is a One fifty eIght 5 fifty ", "1508550"),
        ("One fifty-eight 5 oh o fifty", "15850050"),
    ]
    for text, expected in condensed_cases:
        assert condensed(text) == expected

    # Uncondensed output: numbers are rewritten, other text is untouched.
    uncondensed_cases = [
        (" triPle 5 fifty 3", " 5 5 5 50 3"),
        (" It is a One fifty eIght 5 fifty ", " It is a 1 50 8 5 50 "),
    ]
    for text, expected in uncondensed_cases:
        assert uncondensed(text) == expected

    kept = keep(
        "my phone number is One hundred fifty-eight not 5 oh o fifty more"
    )
    assert kept == ("One hundred fifty-eight 5 oh o fifty", 7)
def test_alnum():
    """Check the alphanumeric replacers and the alphanumeric keeper."""
    condensed = alnum_replacer()
    condensed_oh_zero = alnum_replacer(oh_is_zero=True)
    uncondensed = alnum_replacer(condense=False)
    keep = alnum_keeper()

    condensed_cases = [
        ("I'm thirty-two", "32"),
        ("a thirty-two", "A32"),
        ("not a b thirty-two fifty-nine", "AB3259"),
        (" triPle 5 fifty 3", "555503"),
        ("douBle 2 130", "22130"),
        ("It is a One b fifty eIght A Z 5 fifty ", "A1B508AZ550"),
        ("One Z fifty-eight 5 oh o b fifty", "1Z585OOB50"),
    ]
    for text, expected in condensed_cases:
        assert condensed(text) == expected

    uncondensed_cases = [
        (" triPle 5 fifty 3", " 5 5 5 50 3"),
        (
            " It's a ; One b fifty eIght A Z 5 fifty ",
            " Its a 1 b 50 8 A Z 5 50 ",
        ),
        (
            " I'm is a One b fifty eIght A Z 5 fifty ",
            " Im is a 1 b 50 8 A Z 5 50 ",
        ),
    ]
    for text, expected in uncondensed_cases:
        assert uncondensed(text) == expected

    # With oh_is_zero the spoken "oh" / "o" become digits, not letters.
    assert (
        condensed_oh_zero("One Z fifty-eight 5 oh o b fifty") == "1Z58500B50"
    )

    keeper_cases = [
        (
            "I'll phone number One hundred n fifty-eight not 5 oh o fifty"
            " A B more",
            ("One hundred n fifty-eight 5 oh o fifty A B", 10),
        ),
        (
            "I'm One hundred n fifty-eight not 5 oh o fifty A B more",
            ("One hundred n fifty-eight 5 oh o fifty A B", 10),
        ),
        (
            "I am One hundred n fifty-eight not 5 oh o fifty A B more",
            ("I One hundred n fifty-eight 5 oh o fifty A B", 11),
        ),
    ]
    for text, expected in keeper_cases:
        assert keep(text) == expected

105
tests/plume/test_utils.py Normal file
View File

@@ -0,0 +1,105 @@
from plume.utils import (
num_replacer,
num_keeper,
alnum_replacer,
alnum_keeper,
random_segs,
)
import numpy
import random as rand
import pytest
def test_num_replacer_keeper():
    """Check ``num_replacer`` (condensed and not) and ``num_keeper``."""
    condensed = num_replacer()
    uncondensed = num_replacer(condense=False)

    # Condensed output: only the digits survive.
    condensed_cases = [
        ("thirty-two", "32"),
        ("not thirty-two fifty-nine", "3259"),
        (" triPle 5 fifty 3", "555503"),
        ("douBle 2 130", "22130"),
        ("It is a One fifty eIght 5 fifty ", "1508550"),
        ("One fifty-eight 5 oh o fifty", "15850050"),
    ]
    for text, expected in condensed_cases:
        assert condensed(text) == expected

    # Uncondensed output: numbers are rewritten, other text is untouched.
    uncondensed_cases = [
        (" triPle 5 fifty 3", " 5 5 5 50 3"),
        (" It is a One fifty eIght 5 fifty ", " It is a 1 50 8 5 50 "),
    ]
    for text, expected in uncondensed_cases:
        assert uncondensed(text) == expected

    keep = num_keeper()
    kept = keep(
        "my phone number is One hundred fifty-eight not 5 oh o fifty more"
    )
    assert kept == ("One hundred fifty-eight 5 oh o fifty", 7)
def test_alnum_replacer():
    """Check ``alnum_replacer`` in condensed, oh-is-zero and raw modes."""
    condensed = alnum_replacer()
    condensed_oh_zero = alnum_replacer(oh_is_zero=True)
    uncondensed = alnum_replacer(condense=False)

    condensed_cases = [
        ("5 oh i c 3", "5OIC3"),
        ("I am, oh it is 3. I will", "3"),
        ("I oh o 3", "IOO3"),
        ("I will 3 I", "3I"),
        ("I'm thirty-two", "32"),
        ("I am thirty-two", "32"),
        ("I j thirty-two", "IJ32"),
        ("a thirty-two", "A32"),
        ("not a b thirty-two fifty-nine", "AB3259"),
        (" triPle 5 fifty 3", "555503"),
        ("douBle 2 130", "22130"),
        ("It is a One b fifty eIght A Z 5 fifty ", "A1B508AZ550"),
        ("One Z fifty-eight 5 oh o b fifty", "1Z585OOB50"),
        (
            "I One hundred n fifty-eight not 5 oh o fifty A B more",
            "I100N585OO50AB",
        ),
    ]
    for text, expected in condensed_cases:
        assert condensed(text) == expected

    uncondensed_cases = [
        (" triPle 5 fifty 3", " 5 5 5 50 3"),
        (
            " It's a ; One b fifty eIght A Z 5 fifty ",
            " Its a 1 b 50 8 A Z 5 50 ",
        ),
        (
            " I'm is a One b fifty eIght A Z 5 fifty ",
            " Im is a 1 b 50 8 A Z 5 50 ",
        ),
    ]
    for text, expected in uncondensed_cases:
        assert uncondensed(text) == expected

    # With oh_is_zero the spoken "oh" / "o" become digits, not letters.
    assert (
        condensed_oh_zero("One Z fifty-eight 5 oh o b fifty") == "1Z58500B50"
    )
def test_alnum_keeper():
    """Check the kept text and entity count returned by ``alnum_keeper``."""
    keep = alnum_keeper()
    without_leading_i = ("One hundred n fifty-eight 5 oh o fifty A B", 10)
    cases = [
        (
            "I One hundred n fifty-eight not 5 oh o fifty A B more",
            ("I One hundred n fifty-eight 5 oh o fifty A B", 11),
        ),
        (
            "I'll phone number One hundred n fifty-eight not 5 oh o fifty"
            " A B more",
            without_leading_i,
        ),
        (
            "I'm One hundred n fifty-eight not 5 oh o fifty A B more",
            without_leading_i,
        ),
        (
            "I am One hundred n fifty-eight not 5 oh o fifty A B more",
            without_leading_i,
        ),
    ]
    for text, expected in cases:
        assert keep(text) == expected
@pytest.fixture
def random():
    """Seed both NumPy's and the stdlib's global RNGs with 0.

    Tests that request this fixture start from a fixed random state.
    The fixture itself yields no value (``None``).
    """
    numpy.random.seed(0)
    rand.seed(0)
def test_random_segs(random):
    """Check every segment from ``random_segs`` has a length within bounds.

    Args:
        random: The seeding fixture; requested only for its side effect of
            fixing the RNG state, its value is not used.
    """
    # NOTE(review): the first argument (100000) is presumably the total
    # length being segmented - confirm against ``random_segs``.
    min_len, max_len = 1000, 3000
    segs = random_segs(100000, min_len, max_len)
    assert all(min_len <= end - start <= max_len for (start, end) in segs)

View File

@@ -0,0 +1,17 @@
from plume.utils.regentity import infer_num_replacer
def test_infer_num():
    """Check ``infer_num_replacer`` on upper-case, misspelled transcripts."""
    repl = infer_num_replacer()
    cases = [
        (
            "SIX NINE TRIPL EIGHT SIX SIX DOULE NINE THREE ZERO TWO"
            " SEVENT-ONE",
            "69888669930271",
        ),
        (
            "SIX NINE FSIX EIGHT IGSIX SIX NINE NINE THRE ZERO TWO SEVEN"
            " ONE",
            "6968669930271",
        ),
    ]
    for transcript, expected in cases:
        assert repl(transcript) == expected