1
0
mirror of https://github.com/malarinv/plume-asr.git synced 2026-03-08 04:12:35 +00:00

1. fixed nty-num type spellcheck issue

2. added tests for the same
3. remove [infer] optional subsumes [eval]
This commit is contained in:
2021-06-08 17:45:09 +05:30
parent af51fe95cb
commit 4bca2097e1
4 changed files with 42 additions and 9 deletions

View File

@@ -49,6 +49,7 @@ from .regentity import ( # noqa
default_num_only_rules,
default_alnum_rules,
entity_replacer_keeper,
vocab_corrector_gen,
)
boto3 = lazy_module("boto3")

View File

@@ -4,6 +4,8 @@ from .lazy_import import lazy_callable, lazy_module
num2words = lazy_callable("num2words.num2words")
spellchecker = lazy_module("spellchecker")
editdistance = lazy_module("editdistance")
# from num2words import num2words
@@ -363,15 +365,34 @@ def infer_num_replacer(num_range=100, condense=True):
return final_replacer
def vocab_corrector_gen(vocab, distance=1, method="spell"):
    """Build a callable that maps input text onto a fixed vocabulary.

    Args:
        vocab: Iterable of allowed words. It is materialized into a list
            once, so a generator may be passed safely.
        distance: Edit distance handed to ``SpellChecker``; only used by
            the ``"spell"`` method.
        method: ``"spell"`` corrects each whitespace-separated token of the
            input with a spell checker whose dictionary is restricted to
            ``vocab``. ``"edit"`` compares the whole input string against
            every vocabulary entry using ``editdistance.eval``.

    Returns:
        A one-argument function. For ``"spell"`` it returns the corrected
        tokens joined by single spaces. For ``"edit"`` it returns a
        ``(word, distance)`` tuple for the closest vocabulary entry (the
        first one wins on ties); calling it with an empty ``vocab`` raises
        ``ValueError``.

    Raises:
        ValueError: If ``method`` is neither ``"spell"`` nor ``"edit"``.
    """
    # Reject bad input before paying for the spell-checker dictionary setup.
    if method not in ("spell", "edit"):
        raise ValueError(f"unsupported method:{method}")

    # vocab is traversed several times below; a one-shot iterator would be
    # silently exhausted after the first pass.
    vocab = list(vocab)

    if method == "edit":
        # editdistance.eval("banana", "bahama")
        def corrector(inp):
            # min() keeps the first of equally distant entries, which is
            # what a stable sort followed by [0] would select.
            return min(
                ((v, editdistance.eval(inp, v)) for v in vocab),
                key=lambda pair: pair[1],
            )

        return corrector

    spell = spellchecker.SpellChecker(distance=distance)
    # Restrict the checker's dictionary to exactly the supplied vocabulary.
    words_to_remove = set(spell.word_frequency.words()) - set(vocab)
    spell.word_frequency.remove_words(words_to_remove)
    spell.word_frequency.load_words(vocab)

    def corrector(inp):
        # Plain str.split() is used instead of spell.split_words() so that
        # tokens are split on whitespace only.
        # Some pyspellchecker releases return None from correction() when no
        # candidate exists; fall back to the original token in that case so
        # the join never receives None.
        return " ".join(spell.correction(tok) or tok for tok in inp.split())

    return corrector