78 lines
1.3 KiB
Python
78 lines
1.3 KiB
Python
import re

import numpy as np
import pandas as pd
|
|
# Phoneme symbol translation table, one "<source> <target>" pair per line.
# NOTE(review): the sources look like ARPAbet symbols and the function name
# sim_mat_to_apple_table suggests the targets are Apple-style phoneme codes;
# confirm against the consumer of this table.
_PHONEME_PAIRS = """
AA AA
AE AE
AH UX
AO AO
AW AW
AY AY
B b
CH C
D d
DH D
EH EH
ER UXr
EY EY
F f
G g
HH h
IH IH
IY IY
JH J
K k
L l
M m
N n
NG N
OW OW
OY OY
P p
R r
S s
SH S
T t
TH T
UH UH
UW UW
V v
W w
Y y
X x
Z z
ZH Z
"""

# Each row is split once; blank lines are skipped, and a row that does not
# hold exactly two fields raises ValueError instead of being half-parsed.
mapping = {
    source: target
    for source, target in (
        row.split() for row in _PHONEME_PAIRS.splitlines() if row.strip()
    )
}
|
|
|
|
# Similarity matrix loaded from disk: the first CSV row supplies the column
# labels and the first CSV column supplies the row labels.
sim_mat = pd.read_csv(
    './similarity.csv',
    header=0,
    index_col=0,
)
|
|
|
|
# Splits a symbol such as "AH0" into the part before its last digit ("AH")
# and that digit ("0"). Raw string: "\w" is an invalid escape in a plain one.
_STRESS_RE = re.compile(r"(\w+)([0-9])")


def convert_ph(ph, table=None):
    """Translate one phoneme symbol through a symbol table.

    When ``ph`` contains a digit preceded by at least one word character,
    the digit is moved to the front of the translated base symbol
    (``"AH0"`` -> ``"0" + table["AH"]``). Otherwise ``ph`` itself is looked
    up in the table.

    Args:
        ph: Phoneme symbol, optionally carrying a digit suffix.
        table: Mapping from source symbol to target symbol. Defaults to the
            module-level ``mapping``.

    Returns:
        The translated symbol, prefixed by the digit when one was present.

    Raises:
        KeyError: If the base symbol is not a key of the table.
    """
    if table is None:
        table = mapping

    match = _STRESS_RE.search(ph)
    if match is None:
        return table[ph]

    base, digit = match.groups()
    return digit + table[base]
|
|
|
|
def sim_mat_to_apple_table(smt):
    """Flatten a similarity matrix into a long-form lookup table.

    Missing cells are treated as 0, the matrix is added to its transpose and
    every diagonal cell is set to 100.0. The result is relabelled with the
    converted row labels on both axes and stacked into one row per pair.

    NOTE(review): adding the transpose makes a matrix stored as a single
    triangle symmetric; a matrix that is already full would have its
    off-diagonal scores doubled. Confirm the layout of the input file.

    Args:
        smt: DataFrame whose index holds symbols accepted by ``convert_ph``.
            Assumed to be square with rows and columns in the same order.

    Returns:
        DataFrame with columns ``q`` (row symbol), ``r`` (column symbol) and
        ``s`` (score), one row per (q, r) pair.

    Raises:
        KeyError: If an index label cannot be translated by ``convert_ph``.
    """
    labels = [convert_ph(ph) for ph in smt.index]

    # Work on an array this function owns: DataFrame.values may be a
    # read-only view, which makes an in-place fill_diagonal on it fail.
    scores = np.nan_to_num(smt.to_numpy(dtype=float))
    symmetric = scores.T + scores
    np.fill_diagonal(symmetric, 100.0)

    labelled = pd.DataFrame(symmetric, index=labels, columns=labels)
    lookup = labelled.stack().reset_index()
    lookup.columns = ['q', 'r', 's']
    return lookup
|
|
|
|
# Long-form (q, r, s) lookup table, built once when the module is imported;
# top_match() filters it on every call.
apple_sim_lookup = sim_mat_to_apple_table(sim_mat)
|
|
|
|
def top_match(ph, threshold=70, lookup=None):
    """Return the symbol most similar to ``ph``, or ``ph`` if none qualifies.

    A candidate qualifies when its score is at least ``threshold`` and
    strictly below 100; 100 is the value ``sim_mat_to_apple_table`` writes on
    the diagonal, so a symbol never matches itself. Among equally scored
    candidates the first one in table order wins.

    Args:
        ph: Query symbol, compared against the ``q`` column.
        threshold: Inclusive lower bound on the score. Defaults to 70.
        lookup: DataFrame with columns ``q``, ``r`` and ``s``. Defaults to
            the module-level ``apple_sim_lookup``.

    Returns:
        The ``r`` value of the best-scoring candidate, or ``ph`` unchanged
        when no row qualifies.
    """
    table = apple_sim_lookup if lookup is None else lookup

    scores = table["s"]
    candidates = table[(table["q"] == ph) & (scores < 100) & (scores >= threshold)]
    if candidates.empty:
        return ph

    # Positional argmax avoids sorting the whole selection for one row.
    best = int(candidates["s"].to_numpy().argmax())
    return candidates["r"].iloc[best]
|
|
|
|
def similar_phoneme(ph_str):
    """Return ``ph_str`` unchanged.

    No substitution is performed: whatever is passed in is handed straight
    back to the caller.
    """
    return ph_str
|