from __future__ import annotations
from pycantonese.jyutping.parse_jyutping import parse_jyutping
# Jyutping onset -> TIPA markup for that onset.
#
# Values ending in a LaTeX macro (e.g. "\\super w ") carry a trailing space
# so the macro name is terminated before the final's letters are appended
# directly after it (onset and final are concatenated with no separator).
ONSETS_TIPA = {
    # Onsets written with b/d/g/gw/z in Jyutping.
    "b": "p",
    "d": "t",
    "g": "k",
    "gw": "k\\super w ",
    "z": "ts",
    # Onsets written with p/t/k/kw/c in Jyutping (superscript-h variants).
    "p": "p\\super h ",
    "t": "t\\super h ",
    "k": "k\\super h ",
    "kw": "k\\super w\\super h ",
    "c": "ts\\super h ",
    # Nasal onsets.
    "m": "m",
    "n": "n",
    "ng": "N",
    # Remaining single-letter onsets, mapped to themselves.
    "f": "f",
    "h": "h",
    "s": "s",
    "l": "l",
    "w": "w",
    "j": "j",
    "v": "v",
    # Syllable with no onset.
    "": "",
}
# Jyutping final (nucleus + coda spelling) -> TIPA markup for that final.
#
# Several values end in a space after a LaTeX macro ("\\textcorner ",
# "\\oe "); the converter strips surrounding whitespace from onset + final
# before appending the tone digits, so that trailing space never reaches
# the output.
FINALS_TIPA = {
    # Finals spelled with "i".
    "i": "i",
    "ip": "ip\\textcorner ",
    "it": "it\\textcorner ",
    "ik": "Ik\\textcorner ",
    "im": "im",
    "in": "in",
    "ing": "IN",
    "iu": "iu",
    # Finals spelled with "yu".
    "yu": "y",
    "yut": "yt\\textcorner ",
    "yun": "yn",
    # Finals spelled with "u".
    "u": "u",
    "ut": "ut\\textcorner ",
    "uk": "Uk\\textcorner ",
    "un": "un",
    "ung": "UN",
    "ui": "uY",
    # Finals spelled with "e".
    "e": "E",
    "ek": "Ek\\textcorner ",
    "eng": "EN",
    "ei": "eI",
    # Finals spelled with "eo".
    "eot": "8t\\textcorner ",
    "eon": "8n",
    "eoi": "8Y",
    # Finals spelled with "oe".
    "oe": "\\oe ",
    "oek": "\\oe k\\textcorner ",
    "oeng": "\\oe N",
    # Finals spelled with "o".
    "o": "O",
    "ot": "Ot\\textcorner ",
    "ok": "Ok\\textcorner ",
    "on": "On",
    "ong": "ON",
    "oi": "OY",
    "ou": "ou",
    # Finals spelled with a single "a".
    "ap": "5p\\textcorner ",
    "at": "5t\\textcorner ",
    "ak": "5k\\textcorner ",
    "am": "5m",
    "an": "5n",
    "ang": "5N",
    "ai": "5I",
    "au": "5u",
    # Finals spelled with "aa".
    "aa": "a",
    "aap": "ap\\textcorner ",
    "aat": "at\\textcorner ",
    "aak": "ak\\textcorner ",
    "aam": "am",
    "aan": "an",
    "aang": "aN",
    "aai": "aI",
    "aau": "au",
    # Nasal-only finals, wrapped in the \s{...} macro.
    "m": "\\s{m}",
    "ng": "\\s{N}",
}
# Jyutping tone digit (as a string) -> two-digit string appended to the end
# of each converted syllable (presumably Chao-style pitch numerals).
TONES_TIPA = {
    "1": "55",
    "2": "25",
    "3": "33",
    "4": "21",
    "5": "23",
    "6": "22",
}
# NOTE: a stray "[docs]" marker (documentation-viewer link text, not Python) was removed here.
def jyutping_to_tipa(jp: str | list[str]) -> list[str]:
    """Render Jyutping romanization as LaTeX TIPA markup.

    Args:
        jp (str or list[str]): Either the Jyutping of one word (one or more
            syllables, which may be separated by spaces), or a list of such
            strings where each element is treated as its own word.

    Returns:
        list[str]: One TIPA string per input word, in input order. Inside a
        word, the converted syllables are joined by a single space. An
        empty or otherwise falsy ``jp`` gives an empty list.

    Raises:
        ValueError: If the Jyutping romanization is illegal (e.g., with
            unrecognized elements).

    Examples:
        >>> jyutping_to_tipa("gwong2dung1waa2")  # doctest: +SKIP
        ['k\\super w ON25 tUN55 wa25']
    """
    # Nothing to convert: covers "", [] and None alike.
    if not jp:
        return []

    # A bare string is a single word; a list already carries segmentation.
    if isinstance(jp, str):
        segmented = [jp]
    else:
        segmented = jp

    tipa_words = []
    for word in segmented:
        syllables = _word_to_tipa_syllables(word)
        tipa_words.append(" ".join(syllables))
    return tipa_words
def _word_to_tipa_syllables(word: str) -> list[str]:
    """Convert the Jyutping of one word into per-syllable TIPA strings.

    Args:
        word: Jyutping romanization handed to ``parse_jyutping``.

    Returns:
        One string per parsed syllable, in order. Each is the onset markup
        followed by the final markup, with surrounding whitespace stripped,
        followed by the tone digits.

    Raises:
        ValueError: If a parsed onset, final or tone has no entry in the
            TIPA tables. Any error raised by ``parse_jyutping`` itself
            propagates unchanged.
    """
    tipa_list = []
    for syllable in parse_jyutping(word):
        # TODO: Separate "final" as "nucleus" and "coda" instead?
        try:
            onset = ONSETS_TIPA[syllable.onset]
            final = FINALS_TIPA[syllable.final]
            tone = TONES_TIPA[syllable.tone]
        except KeyError as err:
            # The public API documents ValueError for unrecognized elements;
            # do not leak a bare KeyError from the lookup tables.
            raise ValueError(
                f"no TIPA mapping for Jyutping element {err.args[0]!r} "
                f"in {word!r}"
            ) from err
        # Table values may end in a space that terminates a LaTeX macro;
        # strip it so the tone digits attach directly to the segment.
        tipa_list.append((onset + final).strip() + tone)
    return tipa_list