Source code for pycantonese.jyutping.parse_jyutping

from __future__ import annotations

import dataclasses
import re

# Strings accepted as a syllable onset. The empty string is a member so that
# a syllable without an onset passes a plain membership test.
ONSETS = {
    "b", "d", "g", "gw", "z",
    "p", "t", "k", "kw", "c",
    "m", "n", "ng",
    "f", "h", "s",
    "l", "w", "j", "v",
    "",
}

# Strings accepted as a syllable nucleus, including "m" and "ng", which can
# stand as a nucleus on their own.
NUCLEI = {
    "aa", "a", "i", "yu", "u", "oe", "e", "eo", "o",
    "m", "ng",
}

# Strings accepted as a syllable coda; the empty string means "no coda".
CODAS = {
    "p", "t", "k",
    "m", "n", "ng",
    "i", "u",
    "",
}

# Tone marks are kept as one-character strings, not integers.
TONES = set("123456")

# Pattern for exactly one Jyutping syllable: optional onset, mandatory nucleus,
# optional coda, and a tone digit from 1 to 6. Intended for use with
# ``fullmatch``.
#
# Two-char alternatives are listed before single-char ones (e.g., "aa" before "a",
# "ng" before "n"). Python's regex engine tries alternatives left-to-right and
# backtracks on failure, which correctly resolves ambiguities like "m4" (onset="m"
# fails -> backtracks to nucleus="m") and "hng6" (onset="h", nucleus="ng").
#
# The nucleus alternatives mirror NUCLEI: the only consonantal nuclei are "m"
# and "ng". A bare "n" is not a nucleus, so inputs such as "n4" or "bn1" do not
# match and are rejected by the caller.
_JYUTPING_SYLLABLE_RE = re.compile(
    r"(?P<onset>gw|kw|ng|[bdgzptkcmnfhslwjv])?"
    r"(?P<nucleus>aa|oe|eo|yu|ng|[aeioum])"
    r"(?P<coda>ng|[iptkmnu])?"
    r"(?P<tone>[1-6])"
)


@dataclasses.dataclass
class Jyutping:
    """One Jyutping syllable (one Chinese/Cantonese character) in four parts.

    Attributes:
        onset (str): Onset
        nucleus (str): Nucleus
        coda (str): Coda
        tone (str): Tone
    """

    # Explicit slots: instances carry no per-instance ``__dict__``.
    __slots__ = ("onset", "nucleus", "coda", "tone")

    onset: str
    nucleus: str
    coda: str
    tone: str

    def __str__(self) -> str:
        """Return the romanization: onset, then final, then tone."""
        return f"{self.onset}{self.final}{self.tone}"

    def __hash__(self) -> int:
        # Defined explicitly so instances stay hashable; a non-frozen
        # dataclass with ``eq=True`` would otherwise get ``__hash__ = None``.
        return hash(str(self))

    @property
    def final(self) -> str:
        """Return the final, i.e. the nucleus followed by the coda."""
        return f"{self.nucleus}{self.coda}"
def parse_jyutping(jp_str) -> list[Jyutping]:
    """Split Jyutping romanization into onset, nucleus, coda, and tone.

    The input is lower-cased, all whitespace is dropped, and the remaining
    text is cut into syllables after every digit character. Each syllable is
    then parsed on its own.

    Args:
        jp_str (str): Jyutping romanization for one or multiple characters.
            Any falsy value (e.g. ``""`` or ``None``) yields an empty list.

    Returns:
        list[Jyutping]

    Raises:
        ValueError: If the argument is a truthy non-string, if the input does
            not end in a digit, or if a syllable is illegal (e.g., with
            unrecognized elements).

    Examples:
        >>> parse_jyutping("gwong2 dung1 waa2")  # 廣東話, Cantonese
        [Jyutping(onset='gw', nucleus='o', coda='ng', tone='2'),
         Jyutping(onset='d', nucleus='u', coda='ng', tone='1'),
         Jyutping(onset='w', nucleus='aa', coda='', tone='2')]
    """
    if not jp_str:
        return []
    if not isinstance(jp_str, str):
        raise ValueError("argument needs to be a string -- " + repr(jp_str))

    # Whitespace carries no information here: drop every whitespace character
    # so that only syllable material and tone digits remain.
    compact = "".join(jp_str.lower().split())

    # Every syllable must close with a tone digit, so the last remaining
    # character has to be a digit (an all-whitespace input fails here too).
    trailing = compact[-1:]
    if not trailing.isdigit():
        raise ValueError("tone error -- " + repr(trailing))

    # Cut the compacted text right after each digit.
    syllables = []
    start = 0
    for pos, char in enumerate(compact):
        if char.isdigit():
            syllables.append(compact[start : pos + 1])
            start = pos + 1

    parsed = []
    for syllable in syllables:
        if len(syllable) < 2:
            raise ValueError(
                "jyutping string has fewer than 2 characters -- " + repr(syllable)
            )
        match = _JYUTPING_SYLLABLE_RE.fullmatch(syllable)
        if not match:
            # Raises a ValueError naming the part of the syllable that is wrong.
            _raise_detailed_error(syllable)
            continue
        onset, nucleus, coda, tone = match.group("onset", "nucleus", "coda", "tone")
        # Optional groups that did not take part in the match come back as None.
        parsed.append(Jyutping(onset or "", nucleus, coda or "", tone))
    return parsed
def _raise_detailed_error(jp: str) -> None:
    """Raise a ValueError naming the part of a rejected syllable that is wrong.

    The checks run in a fixed order -- tone, coda, nucleus, onset -- and the
    first one that fails determines the message. If none of them fails, a
    generic "invalid jyutping" error is raised, so this function never returns
    normally.

    Args:
        jp (str): A single syllable including its final tone character.

    Raises:
        ValueError: Always.
    """
    body, tone = jp[:-1], jp[-1]

    if tone not in TONES:
        raise ValueError("tone error -- " + repr(jp))

    last = body[-1]
    if last not in "ieaouptkmng":
        raise ValueError("coda error -- " + repr(jp))

    # Peel a coda off the right-hand end so that only onset + vowels remain.
    # NOTE(review): a trailing "i"/"u" is always treated as a coda here, so an
    # input like "xi1" is reported as a nucleus error rather than an onset error.
    if body.endswith("ng"):
        stem = body[:-2]
    elif last in "ptkmn" or last in "iu":
        stem = body[:-1]
    else:
        stem = body

    # Whatever vowel letters trail the stem form the nucleus; the rest is the onset.
    onset = stem.rstrip("ieaouy")
    nucleus = stem[len(onset):]

    if not nucleus:
        raise ValueError("nucleus error -- " + repr(jp))
    if onset not in ONSETS:
        raise ValueError("onset error -- " + repr(jp))
    raise ValueError("invalid jyutping -- " + repr(jp))


def _parse_final(final):
    """Split a final into its nucleus and coda.

    Split points are tried from left to right and the first one whose left
    part is in ``NUCLEI`` and whose right part is in ``CODAS`` wins.

    Args:
        final (str): The final to parse.

    Returns:
        tuple[str, str] | None: ``(nucleus, coda)``, or ``None`` when no split
        is valid (including when ``final`` is empty).
    """
    candidates = ((final[:cut], final[cut:]) for cut in range(1, len(final) + 1))
    return next(
        (
            (nucleus, coda)
            for nucleus, coda in candidates
            if nucleus in NUCLEI and coda in CODAS
        ),
        None,
    )