Source code for pycantonese.jyutping.parse_jyutping

from __future__ import annotations

import dataclasses
import re

ONSETS = {
    "b",
    "d",
    "g",
    "gw",
    "z",
    "p",
    "t",
    "k",
    "kw",
    "c",
    "m",
    "n",
    "ng",
    "f",
    "h",
    "s",
    "l",
    "w",
    "j",
    "v",
    "",
}

NUCLEI = {"aa", "a", "i", "yu", "u", "oe", "e", "eo", "o", "m", "ng"}

CODAS = {"p", "t", "k", "m", "n", "ng", "i", "u", ""}

TONES = {"1", "2", "3", "4", "5", "6"}

# Two-char alternatives are listed before single-char ones (e.g., "aa" before "a",
# "ng" before "n"). Python's NFA regex engine tries alternatives left-to-right and
# backtracks on failure, which correctly resolves ambiguities like "m4" (onset="m"
# fails -> backtracks to nucleus="m") and "hng6" (onset="h", nucleus="ng").
_JYUTPING_SYLLABLE_RE = re.compile(
    r"(?P<onset>gw|kw|ng|[bdgzptkcmnfhslwjv])?"
    r"(?P<nucleus>aa|oe|eo|yu|ng|[aeioumn])"
    r"(?P<coda>ng|[iptkmnu])?"
    r"(?P<tone>[1-6])"
)



[docs]
@dataclasses.dataclass
class Jyutping:
    """Jyutping representation of a Chinese/Cantonese character.

    Attributes:
        onset (str): Onset
        nucleus (str): Nucleus
        coda (str): Coda
        tone (str): Tone
    """

    __slots__ = ("onset", "nucleus", "coda", "tone")
    onset: str
    nucleus: str
    coda: str
    tone: str


[docs]
    def __str__(self):
        """Combine onset + nucleus + coda + tone."""
        return f"{self.onset}{self.nucleus}{self.coda}{self.tone}"



[docs]
    def __hash__(self):
        return hash(self.__str__())


    @property
    def final(self):
        """Return the final (= nucleus + coda)."""
        return f"{self.nucleus}{self.coda}"




[docs]
def parse_jyutping(jp_str) -> list[Jyutping]:
    """Parse Jyutping romanization into onset, nucleus, coda, and tone.

    Args:
        jp_str (str): Jyutping romanization for one or multiple characters.

    Returns:
        list[Jyutping]

    Raises:
        ValueError: If the Jyutping romanization is illegal (e.g., with
            unrecognized elements).

    Examples:
        >>> parse_jyutping("gwong2 dung1 waa2")  # 廣東話, Cantonese
        [Jyutping(onset='gw', nucleus='o', coda='ng', tone='2'),
         Jyutping(onset='d', nucleus='u', coda='ng', tone='1'),
         Jyutping(onset='w', nucleus='aa', coda='', tone='2')]
    """
    if not jp_str:
        return []

    if not isinstance(jp_str, str):
        raise ValueError("argument needs to be a string -- " + repr(jp_str))
    jp_str = jp_str.lower()

    # Split into individual syllables at tone digits; whitespace between
    # syllables is tolerated and ignored.
    jp_list = []
    jp_current = ""
    last_non_ws = ""
    for c in jp_str:
        if c.isspace():
            continue
        jp_current = jp_current + c
        last_non_ws = c
        if c.isdigit():
            jp_list.append(jp_current)
            jp_current = ""

    if not last_non_ws.isdigit():
        raise ValueError("tone error -- " + repr(last_non_ws))

    jp_parsed_list = []

    for jp in jp_list:
        if len(jp) < 2:
            raise ValueError(
                "jyutping string has fewer than 2 characters -- " + repr(jp)
            )

        match = _JYUTPING_SYLLABLE_RE.fullmatch(jp)
        if match:
            jp_parsed_list.append(
                Jyutping(
                    match.group("onset") or "",
                    match.group("nucleus"),
                    match.group("coda") or "",
                    match.group("tone"),
                )
            )
        else:
            _raise_detailed_error(jp)

    return jp_parsed_list



def _raise_detailed_error(jp: str) -> None:
    """Analyze a failed Jyutping syllable and raise a descriptive ValueError."""
    tone = jp[-1]
    if tone not in TONES:
        raise ValueError("tone error -- " + repr(jp))

    cvc = jp[:-1]

    if cvc[-1] not in "ieaouptkmng":
        raise ValueError("coda error -- " + repr(jp))

    # Try to extract onset by stripping vowels from the right
    cv = cvc
    if cvc[-2:] == "ng":
        cv = cvc[:-2]
    elif cvc[-1] in "ptkmn" or cvc[-1] in "iu":
        cv = cvc[:-1]

    nucleus = ""
    while cv and cv[-1] in "ieaouy":
        nucleus = cv[-1] + nucleus
        cv = cv[:-1]

    if not nucleus:
        raise ValueError("nucleus error -- " + repr(jp))

    if cv not in ONSETS:
        raise ValueError("onset error -- " + repr(jp))

    raise ValueError("invalid jyutping -- " + repr(jp))


def _parse_final(final):
    """Parse a final into its nucleus and coda.

    Args:
        final (str): The final to parse.

    Returns:
        tuple[str]
    """
    for i in range(1, len(final) + 1):
        possible_nucleus = final[:i]
        possible_coda = final[i:]

        if (possible_nucleus in NUCLEI) and (possible_coda in CODAS):
            return possible_nucleus, possible_coda
    return None