"""Phoneme similarity lookup built from a pairwise similarity matrix.

At import time this module reads ``./similarity.csv``, translates its
phoneme labels through ``mapping`` and exposes the result as the long-format
table ``apple_sim_lookup`` (columns ``q``, ``r``, ``s``).
"""
import re

import numpy as np
import pandas as pd

# Symbol translation table, one "<source> <target>" pair per line.
# NOTE(review): the sources look like ARPAbet phonemes and the targets like
# Apple speech-synthesis phoneme symbols -- confirm against the data owner.
_MAPPING_TABLE = """
AA AA
AE AE
AH UX
AO AO
AW AW
AY AY
B b
CH C
D d
DH D
EH EH
ER UXr
EY EY
F f
G g
HH h
IH IH
IY IY
JH J
K k
L l
M m
N n
NG N
OW OW
OY OY
P p
R r
S s
SH S
T t
TH T
UH UH
UW UW
V v
W w
Y y
X x
Z z
ZH Z
"""

# Source phoneme symbol -> target phoneme symbol.
mapping = dict(line.split() for line in _MAPPING_TABLE.strip().splitlines())

# Pairwise similarity scores; the first CSV column supplies the row labels.
sim_mat = pd.read_csv('./similarity.csv', header=0, index_col=0)

# A phoneme followed by a single digit, e.g. "AA1" -> ("AA", "1").
_STRESSED_PHONEME = re.compile(r"(\w+)([0-9])")


def convert_ph(ph):
    """Translate one source phoneme label into its target symbol.

    A label containing a digit (e.g. ``"AA1"``) is rendered as that digit
    followed by the mapped symbol (``"1AA"``); a label without a digit is
    mapped directly.

    Raises:
        KeyError: if the phoneme (without its digit) is not in ``mapping``.
    """
    stress_level = _STRESSED_PHONEME.search(ph)
    if stress_level:
        return stress_level.group(2) + mapping[stress_level.group(1)]
    return mapping[ph]


def sim_mat_to_apple_table(smt):
    """Convert a similarity matrix into a long-format lookup table.

    The matrix is made symmetric by adding it to its transpose (missing
    entries count as 0), the diagonal is set to 100, and row/column labels
    are translated with ``convert_ph``.

    Args:
        smt: DataFrame whose index holds source phoneme labels.
            Assumes the matrix is square with rows and columns in the same
            order -- TODO confirm against the CSV.

    Returns:
        DataFrame with columns ``q`` (row symbol), ``r`` (column symbol)
        and ``s`` (similarity score), one row per matrix cell.
    """
    colnames = [convert_ph(ph) for ph in smt.index.tolist()]

    # Work on a plain ndarray: writing through ``DataFrame.values`` is not
    # reliable (the array is read-only under pandas Copy-on-Write).
    scores = np.nan_to_num(smt.values)
    symmetric = scores.T + scores
    np.fill_diagonal(symmetric, 100.0)

    asmt = pd.DataFrame(symmetric, index=colnames, columns=colnames)
    apple_sim_lookup = asmt.stack().reset_index()
    apple_sim_lookup.columns = ['q', 'r', 's']
    return apple_sim_lookup


apple_sim_lookup = sim_mat_to_apple_table(sim_mat)


def top_match(ph):
    """Return the symbol most similar to ``ph``, or ``ph`` itself.

    Only candidates with a score in ``[70, 100)`` are considered, which
    excludes the diagonal (self-similarity is fixed at 100). When no
    candidate qualifies, ``ph`` is returned unchanged.
    """
    selected = apple_sim_lookup[
        (apple_sim_lookup.q == ph)
        & (apple_sim_lookup.s < 100)
        & (apple_sim_lookup.s >= 70)
    ]
    if selected.empty:
        return ph
    return selected.sort_values('s', ascending=False).iloc[0].r


def similar_phoneme(ph_str):
    """Return ``ph_str`` unchanged.

    NOTE(review): this is currently an identity function; it looks like a
    placeholder for a substitution based on ``top_match`` -- confirm intent.
    """
    return ph_str