Source code for pycantonese.jyutping.yale

from __future__ import annotations

import unicodedata

from pycantonese.jyutping.parse_jyutping import parse_jyutping

# Jyutping onset -> Yale onset. The empty string stands for "no onset".
# Only three spellings actually differ: z -> j, c -> ch, j -> y.
ONSETS_YALE = {
    "b": "b",
    "d": "d",
    "g": "g",
    "gw": "gw",
    "z": "j",
    "p": "p",
    "t": "t",
    "k": "k",
    "kw": "kw",
    "c": "ch",
    "m": "m",
    "n": "n",
    "ng": "ng",
    "f": "f",
    "h": "h",
    "s": "s",
    "l": "l",
    "w": "w",
    "j": "y",
    "v": "v",
    "": "",
}

# Jyutping nucleus -> Yale nucleus (base letters only, before any tone
# diacritic is attached). Jyutping "oe" and "eo" both collapse to Yale "eu",
# so this table is not invertible on its own. "m" and "ng" are the syllabic
# nasals.
NUCLEI_YALE = {
    "aa": "aa",
    "a": "a",
    "i": "i",
    "yu": "yu",
    "u": "u",
    "oe": "eu",
    "e": "e",
    "eo": "eu",
    "o": "o",
    "m": "m",
    "ng": "ng",
}

# Jyutping coda -> Yale coda. The mapping is the identity; the empty string
# stands for "no coda". The Yale-only coda "w" (used for the Jyutping final
# "eu") is not listed here and is produced by the conversion code instead.
CODAS_YALE = {
    "p": "p",
    "t": "t",
    "k": "k",
    "m": "m",
    "n": "n",
    "ng": "ng",
    "i": "i",
    "u": "u",
    "": "",
}


# Consonant spellings that can be read either as the coda of one syllable or
# as the onset of the next ("h" doubles as the low-tone marker).
_YALE_AMBIGUOUS_CONSONANTS = ("ng", "h", "p", "t", "k", "m", "n")

# Vowel letters as they may appear at the start of a displayed Yale syllable:
# plain, acute, grave and macron forms.
_YALE_VOWEL_DISPLAY_LETTERS = frozenset("aeiou" "áéíóú" "àèìòù" "āēīōū")


def _needs_apostrophe(prev_syl: str, next_syl: str) -> bool:
    """Decide whether ``prev_syl`` and ``next_syl`` need a ``'`` between them.

    Two independent tests are applied, and either one can demand the
    apostrophe:

    1. A spelling heuristic. If ``prev_syl`` ends in one of the ambiguous
       consonants, an apostrophe is needed when ``next_syl`` begins with a
       vowel letter. If ``prev_syl`` does not end in one, an apostrophe is
       needed when ``next_syl`` begins with an ambiguous consonant.
    2. A round-trip test. The two syllables are NFD-normalized, glued
       together and handed to ``_split_piece``; if the splitter fails with
       ``ValueError`` or returns anything other than the original pair, the
       boundary is ambiguous.

    The round-trip test covers what the heuristic cannot, namely both sides
    being ambiguous consonants. For example ``"yih"`` + ``"pa"`` glues to
    ``"yihpa"``, which the greedy splitter reads as ``"yihp"`` + ``"a"``, so
    an apostrophe is required. Conversely ``"m̀h"`` + ``"gōi"`` re-splits to
    the same two syllables, so none is needed.

    Args:
        prev_syl: The Yale syllable on the left of the boundary.
        next_syl: The Yale syllable on the right of the boundary.

    Returns:
        True if an apostrophe must be inserted, False otherwise.
    """
    # --- spelling heuristic -------------------------------------------------
    if prev_syl.endswith(_YALE_AMBIGUOUS_CONSONANTS):
        # Slicing (rather than indexing) keeps an empty next_syl harmless.
        if next_syl[:1] in _YALE_VOWEL_DISPLAY_LETTERS:
            return True
    elif next_syl.startswith(_YALE_AMBIGUOUS_CONSONANTS):
        return True

    # --- round-trip through the greedy splitter -----------------------------
    left = unicodedata.normalize("NFD", prev_syl)
    right = unicodedata.normalize("NFD", next_syl)
    try:
        resplit = _split_piece(left + right)
    except ValueError:
        # The glued form does not even parse: keep the syllables apart.
        return True
    return resplit != [left, right]



[docs]
def stringify_yale(yale: list[str]) -> str:
    """Render a list of Yale words as a single display string.

    Each list element is one word whose syllables are separated by
    whitespace (the shape returned by :func:`jyutping_to_yale`). Syllables
    of the same word are written together; an apostrophe ``'`` is placed
    between two syllables only where running them together would leave the
    syllable boundary ambiguous. Words are separated by one space. Elements
    that contain no syllables are dropped.

    Args:
        yale (list[str]): Yale words, each a whitespace-separated run of
            syllables.

    Returns:
        str: The joined Yale string; ``""`` when ``yale`` is empty.

    Examples:
        >>> stringify_yale(jyutping_to_yale("gwong2dung1waa2"))  # 廣東話
        'gwóngdūngwá'
        >>> stringify_yale(jyutping_to_yale("hei3hau6"))  # 氣候
        "hei'hauh"
        >>> stringify_yale(jyutping_to_yale(["gwong2dung1", "waa2"]))
        'gwóngdūng wá'
    """
    if not yale:
        return ""

    def _glue(word_syllables):
        # Concatenate one word's syllables, separating ambiguous neighbours.
        pieces = []
        previous = None
        for current in word_syllables:
            if previous is not None and _needs_apostrophe(previous, current):
                pieces.append("'")
            pieces.append(current)
            previous = current
        return "".join(pieces)

    rendered = []
    for entry in yale:
        word_syllables = entry.split()
        if word_syllables:
            rendered.append(_glue(word_syllables))
    return " ".join(rendered)




[docs]
def jyutping_to_yale(jp: str | list[str]) -> list[str]:
    """Convert Jyutping romanization into Yale romanization.

    A plain string is treated as exactly one word; a list is treated as one
    word per element, which is how callers convey word segmentation.

    Args:
        jp (str or list[str]): Jyutping for a single word (any number of
            syllables, optionally separated by spaces), or a list of such
            strings, one per word.

    Returns:
        list[str]: One element per input word, each holding that word's Yale
        syllables separated by a single space. An empty input gives ``[]``.

    Raises:
        ValueError: If the Jyutping romanization is illegal (e.g., with
            unrecognized elements).

    Examples:
        >>> jyutping_to_yale("gwong2dung1waa2")  # 廣東話, Cantonese
        ['gwóng dūng wá']
        >>> jyutping_to_yale(["gwong2dung1", "waa2"])
        ['gwóng dūng', 'wá']
        >>> jyutping_to_yale("hei3hau6")  # 氣候, climate
        ['hei hauh']
    """
    if not jp:
        return []
    if isinstance(jp, str):
        # A bare string is a single word.
        jp = [jp]
    converted = []
    for jp_word in jp:
        converted.append(" ".join(_word_to_yale_syllables(jp_word)))
    return converted



def _word_to_yale_syllables(word: str) -> list[str]:
    """Convert one Jyutping word into a list of Yale syllables.

    The word is parsed with ``parse_jyutping`` and each parsed syllable is
    respelled as follows:

    * onset, nucleus and coda are mapped through ``ONSETS_YALE``,
      ``NUCLEI_YALE`` and ``CODAS_YALE``;
    * a coda-less long "aa" is written "a";
    * tone 1 adds a macron, tones 2 and 5 an acute, tone 4 a grave; tones 3
      and 6 add nothing. The mark goes on the first nucleus letter, except
      that it skips the "y" of "yu" and the "n" of syllabic "ng";
    * tones 4, 5 and 6 additionally get the low-tone marker "h", placed
      after a glide coda (i/u/w) and before any other coda;
    * the Jyutping final "eu" is written "ew" for every tone, so that it can
      never be confused with the Yale nucleus "eu" (Jyutping "oe"/"eo").

    Args:
        word: Jyutping for a single word.

    Returns:
        The Yale spelling of each syllable, in order.

    Raises:
        Whatever ``parse_jyutping`` raises for input it rejects (the public
        wrapper documents this as ``ValueError``).
    """
    # tone -> (suffix of the precomposed letter's Unicode name, combining
    # mark used when no precomposed letter exists). Tones 3 and 6 are
    # deliberately absent: they carry no diacritic.
    tone_diacritics = {
        "1": (" WITH MACRON", "\u0304"),
        "2": (" WITH ACUTE", "\u0301"),
        "5": (" WITH ACUTE", "\u0301"),
        "4": (" WITH GRAVE", "\u0300"),
    }
    low_tones = {"4", "5", "6"}

    yale_list = []
    for jp_parsed in parse_jyutping(word):
        onset = ONSETS_YALE[jp_parsed.onset]
        nucleus = NUCLEI_YALE[jp_parsed.nucleus]
        coda = CODAS_YALE[jp_parsed.coda]
        tone = jp_parsed.tone

        # Yale marks the three low tones with an "h".
        low_tone_h = "h" if tone in low_tones else ""

        # Long "aa" with no coda is written "a".
        if nucleus == "aa" and coda == "":
            nucleus = "a"

        # For nucleus "yu": never write "yyu", and temporarily drop the "y"
        # so the diacritic lands on "u".
        if nucleus == "yu":
            if onset == "y":
                onset = ""
            nucleus = "u"

        # For syllabic "ng": temporarily drop the "n" so the diacritic lands
        # on "g".
        if nucleus == "ng":
            nucleus = "g"

        # Attach the tone diacritic to the first remaining nucleus letter,
        # preferring the precomposed character and falling back to
        # letter + combining mark when Unicode has no such character.
        letter = nucleus[0]
        diacritic = tone_diacritics.get(tone)
        if diacritic is not None:
            name_suffix, combining_mark = diacritic
            try:
                letter = unicodedata.lookup(unicodedata.name(letter) + name_suffix)
            except KeyError:
                letter = letter + combining_mark
        nucleus = letter + nucleus[1:]

        # Restore the letters set aside above.
        if jp_parsed.nucleus == "yu":
            nucleus = "y" + nucleus
        if jp_parsed.nucleus == "ng":
            nucleus = "n" + nucleus

        # Jyutping final "eu" is written "ew" in Yale. The test must use the
        # parsed Jyutping nucleus: by now ``nucleus`` may carry a diacritic
        # (e.g. "é"), and comparing it with "e" would wrongly limit the
        # rewrite to tones 3 and 6.
        if coda == "u" and jp_parsed.nucleus == "e":
            coda = "w"

        # The low-tone "h" follows a glide coda but precedes any other coda.
        if coda in {"i", "u", "w"} and tone in low_tones:
            yale = onset + nucleus + coda + low_tone_h
        else:
            yale = onset + nucleus + low_tone_h + coda
        yale_list.append(yale)

    return yale_list


# Inverse mapping tables for Yale -> Jyutping.
# Every value in ONSETS_YALE and in CODAS_YALE is distinct, so inverting the
# two dicts loses nothing. The Yale-only coda "w" has no entry in CODAS_YALE
# and is therefore absent from _CODAS_JYUTPING; _build_jyutping special-cases
# it.
_ONSETS_JYUTPING = {v: k for k, v in ONSETS_YALE.items()}
_CODAS_JYUTPING = {v: k for k, v in CODAS_YALE.items()}

# Yale nucleus "eu" maps from both Jyutping "oe" and "eo"; resolved by coda
# at parse time (see _resolve_eu), so it is deliberately missing from this
# table. Other Yale nuclei invert cleanly.
_NUCLEI_JYUTPING_UNAMBIGUOUS = {
    "aa": "aa",
    "a": "a",  # coda-less Yale "a" is promoted to Jyutping "aa" in _build_jyutping
    "i": "i",
    "yu": "yu",
    "u": "u",
    "e": "e",
    "o": "o",
    "m": "m",
    "ng": "ng",
}

# Yale onsets ordered longest-first for greedy matching: the two-letter
# onsets come before any single letter so that, e.g., "ng" is tried before
# "n" and "gw" before "g".
_YALE_ONSETS_ORDERED = (
    "ch",
    "gw",
    "kw",
    "ng",
    "b",
    "d",
    "g",
    "j",
    "p",
    "t",
    "k",
    "m",
    "n",
    "f",
    "h",
    "s",
    "l",
    "w",
    "y",
    "v",
)

# Yale nuclei ordered longest-first (base-letter form, no diacritic).
# "ng" and "m" are the syllabic nasals.
_YALE_NUCLEI_ORDERED = ("aa", "eu", "yu", "ng", "a", "e", "i", "o", "u", "m")

# Plain (undecorated) vowel letters; the parser works on NFD text, where a
# tone mark is a separate character following the base letter.
_YALE_VOWEL_LETTERS = set("aeiou")

# Map a combining accent character (as found in NFD text) to the name of the
# tone mark it represents.
# "macron" -> tone 1; "acute" -> tone 2 or 5; "grave" -> tone 4.
# A syllable with no mark at all is tone 3 or 6. In every case the trailing
# "h" decides between the high and the low member of the pair.
_DIACRITIC_TO_MARK = {
    "̄": "macron",
    "́": "acute",
    "̀": "grave",
}


def _strip_diacritic(nucleus_chars):
    """Separate the base letters of a nucleus from its tone mark.

    Args:
        nucleus_chars: An NFD-decomposed nucleus, i.e. base letters
            interleaved with combining marks.

    Returns:
        A ``(base_letters, tone_mark)`` pair. ``tone_mark`` is the name
        looked up in ``_DIACRITIC_TO_MARK`` or ``None`` when the nucleus
        carries no combining mark.

    Raises:
        ValueError: If a combining mark is not a known tone diacritic, or if
            two different tone diacritics occur in the same nucleus. The
            same mark repeated is tolerated.
    """
    letters = []
    tone_mark = None
    for char in nucleus_chars:
        if unicodedata.category(char) != "Mn":
            letters.append(char)
            continue
        candidate = _DIACRITIC_TO_MARK.get(char)
        if candidate is None:
            raise ValueError("unrecognized diacritic in nucleus -- " + repr(char))
        if tone_mark not in (None, candidate):
            raise ValueError("multiple tone diacritics in one syllable")
        tone_mark = candidate
    return "".join(letters), tone_mark


def _tone_from(mark, has_h):
    """Return the Jyutping tone digit for a tone mark and low-tone flag.

    Args:
        mark: ``"macron"``, ``"acute"``, ``"grave"`` or ``None`` (no mark).
        has_h: Whether the syllable carries the low-tone marker "h".

    Returns:
        ``"1"`` for a macron (``has_h`` is ignored), ``"2"``/``"5"`` for an
        acute, ``"4"`` for a grave with "h", and ``"3"``/``"6"`` when there
        is no recognized mark; the second digit of each pair is the one
        chosen when ``has_h`` is true.

    Raises:
        ValueError: For a grave mark without the low-tone "h".
    """
    if mark == "macron":
        return "1"
    if mark == "grave":
        if has_h:
            return "4"
        raise ValueError("grave (tone 4) requires 'h' low-tone marker")
    # Remaining marks come in high/low pairs selected by the "h".
    high, low = ("2", "5") if mark == "acute" else ("3", "6")
    return low if has_h else high


def _resolve_eu(coda_yale):
    """Choose the Jyutping nucleus behind Yale "eu" from the Yale coda.

    Returns "eo" before the codas "n", "t" and "i", and "oe" for every other
    coda (including no coda).
    """
    return "eo" if coda_yale in {"i", "n", "t"} else "oe"


def _split_word_syllables(word):
    """Break a Yale word into its raw syllable strings.

    Apostrophes and whitespace inside the word are both treated as explicit
    syllable breaks. Each stretch between such breaks is then segmented
    further by ``_split_piece``.
    """
    # Turning apostrophes into spaces lets one split() handle both kinds of
    # break; split() never yields empty chunks.
    chunks = word.replace("'", " ").split()
    return [syllable for chunk in chunks for syllable in _split_piece(chunk)]


def _split_piece(piece):
    """Segment an apostrophe-free Yale string into syllables.

    The string is NFD-normalized first, so the returned syllables are in NFD
    form. Segmentation is greedy: ``_find_syllable_end`` is asked repeatedly
    where the syllable starting at the current position stops.
    """
    text = unicodedata.normalize("NFD", piece)
    length = len(text)

    def _spans():
        # Yield (begin, end) index pairs of consecutive syllables.
        cursor = 0
        while cursor < length:
            stop = _find_syllable_end(text, cursor)
            yield cursor, stop
            cursor = stop

    return [text[begin:stop] for begin, stop in _spans()]


def _find_syllable_end(s: str, start: int) -> int:
    """Find the end index (exclusive) of the syllable starting at s[start].

    The scan is greedy and proceeds onset -> nucleus -> coda / low-tone "h".
    Tone marks are recognized as separate combining characters, so ``s`` is
    expected to be NFD text (``_split_piece`` normalizes before calling).

    Yale low-tone 'h' placement depends on coda type:
    - stop/nasal coda (p/t/k/m/n/ng): h comes BEFORE coda  -> nucleus + h + coda
    - glide coda (i/u/w):             h comes AFTER  coda  -> nucleus + coda + h
    - no coda:                        h comes at end       -> nucleus + h

    Args:
        s: The text being segmented.
        start: Index of the first character of the syllable.

    Returns:
        The index one past the last character of the syllable.

    Raises:
        ValueError: If no onset or vowel can be found at ``start``, if no
            nucleus can be found, or if ``_consume_nucleus`` rejects the
            text.
    """
    i = start
    n = len(s)

    # ---- onset ----
    # First match wins; the candidate list is ordered longest-first.
    onset = ""
    for cand in _YALE_ONSETS_ORDERED:
        if s.startswith(cand, i):
            onset = cand
            break
    nucleus_start = i + len(onset)

    # Backtrack: 'm'/'ng' may be a syllabic nasal nucleus, not an onset.
    # This branch is only entered when the onset is NOT followed by a vowel.
    if nucleus_start >= n or s[nucleus_start] not in _YALE_VOWEL_LETTERS:
        if onset in ("m", "ng") and _looks_like_syllabic(s, i, onset):
            onset = ""
            nucleus_start = i
        elif onset == "" and i < n and s[i] in _YALE_VOWEL_LETTERS:
            pass  # vowel-initial syllable
        else:
            # Only the "no onset and no vowel" case fails here. A real onset
            # followed by a non-vowel falls through and is judged by the
            # nucleus stage below.
            if onset == "":
                raise ValueError(
                    "cannot parse Yale syllable starting at "
                    + repr(unicodedata.normalize("NFC", s[i:]))
                )

    # Backtrack onset "y" if it's really the prefix of nucleus "yu" (Jyutping
    # "jyu"). The "yu" nucleus only combines with codas in {"", "n", "t"} in
    # real Cantonese, so only backtrack when the rest fits that shape.
    if onset == "y":
        # Trial parse from the "y" itself, as if there were no onset.
        test_end, test_raw = _consume_nucleus(s, i)
        test_base = "".join(c for c in test_raw if unicodedata.category(c) != "Mn")
        if test_base == "yu" and _yu_compatible_tail(s, test_end):
            onset = ""
            nucleus_start = i

    # ---- nucleus ----
    nuc_end, nuc_raw = _consume_nucleus(s, nucleus_start)
    if nuc_end == nucleus_start:
        # _consume_nucleus signals "nothing consumed" by returning its start.
        raise ValueError(
            "cannot find nucleus in Yale syllable -- "
            + repr(unicodedata.normalize("NFC", s[i:]))
        )

    # Syllabic nasals (onset-less "m" and "ng") never take a coda in Yale/Jyutping.
    nuc_base = "".join(c for c in nuc_raw if unicodedata.category(c) != "Mn")
    is_syllabic_nasal = onset == "" and nuc_base in ("m", "ng")

    # ---- coda + h-marker, accounting for h placement ----
    pos = nuc_end

    if is_syllabic_nasal and (pos >= n or s[pos] != "h"):
        # No 'h' follows: definitely no coda (prevents greedily consuming the
        # next syllable's onset/nucleus as a coda, e.g. ng3+ng5 -> "ngnǵh").
        return pos
    # NOTE(review): a syllabic nasal that IS followed by "h" continues into
    # the branch below, where a stop/nasal after the "h" is still absorbed as
    # a coda -- confirm this is intended given the "never take a coda" rule.

    if pos < n and s[pos] == "h":
        # h before stop/nasal coda (low-tone case), or h at syllable end
        after_h = pos + 1
        for cand in ("ng", "p", "t", "k", "m", "n"):
            if s.startswith(cand, after_h):
                # Greedy: the consonant is claimed as this syllable's coda
                # even if it could have started the next syllable.
                return after_h + len(cand)  # h + stop/nasal coda
        # No stop/nasal coda after h: h is either a low-tone marker (end of
        # syllable) or the onset of the next syllable (followed by a vowel).
        if after_h >= n or s[after_h] not in _YALE_VOWEL_LETTERS:
            return after_h  # low-tone h, no coda
        return pos  # h is the next syllable's onset
    else:
        # Try glide coda (h follows the coda for low tones)
        for cand in ("i", "u", "w"):
            if s.startswith(cand, pos):
                coda_end = pos + len(cand)
                if coda_end < n and s[coda_end] == "h":
                    after_h = coda_end + 1
                    if after_h >= n or s[after_h] not in _YALE_VOWEL_LETTERS:
                        return after_h  # glide coda + low-tone h
                # Either no "h", or the "h" is followed by a vowel and so
                # belongs to the next syllable.
                return coda_end  # glide coda, no h
        # Try stop/nasal coda without h (tones 1-3)
        for cand in ("ng", "p", "t", "k", "m", "n"):
            if s.startswith(cand, pos):
                return pos + len(cand)
        return pos  # no coda


def _yu_compatible_tail(s, pos):
    """Tell whether ``s[pos:]`` can follow a Yale "yu" nucleus.

    The tail is acceptable when it is empty, or when -- after an optional
    low-tone "h" -- it is empty, starts with "t", or starts with an "n" that
    is not the first letter of "ng". Anything else means the "yu" that was
    just read is not a nucleus of its own.
    """
    length = len(s)

    def _starts_with_t_or_plain_n(idx):
        # "t" always qualifies; "n" qualifies unless it begins "ng".
        head = s[idx]
        if head == "t":
            return True
        if head == "n":
            return idx + 1 >= length or s[idx + 1] != "g"
        return False

    if pos >= length:
        return True  # nothing follows the nucleus
    if s[pos] != "h":
        return _starts_with_t_or_plain_n(pos)
    # Low-tone "h": either it ends the string or a permitted coda follows.
    coda_at = pos + 1
    return coda_at >= length or _starts_with_t_or_plain_n(coda_at)


def _looks_like_syllabic(s, start, onset):
    """Tell whether a leading 'm' or 'ng' is a syllabic nasal nucleus.

    ``onset`` is the nasal spelling found at ``s[start]``. It is judged
    syllabic (as in 'm̀h' or 'ǹgh') unless the character right after it is a
    plain vowel letter, in which case the nasal is an ordinary onset.
    End of string, a combining mark, an 'h' or any other consonant all count
    as syllabic.
    """
    after_onset = start + len(onset)
    if after_onset >= len(s):
        return True
    follower = s[after_onset]
    if follower == "h" or unicodedata.category(follower) == "Mn":
        return True
    # A vowel makes it onset + nucleus; anything else marks a boundary.
    return follower not in _YALE_VOWEL_LETTERS


def _consume_nucleus(s, start):
    """Read a Yale nucleus beginning at ``s[start]``.

    Up to two base letters (vowels, or the letters m/n/g/y) are collected,
    each together with any combining marks that directly follow it. The
    collected base letters are then matched against
    ``_YALE_NUCLEI_ORDERED``; the first entry that is a prefix of them is
    the nucleus.

    Args:
        s: NFD text being parsed.
        start: Index at which the nucleus should begin.

    Returns:
        ``(end, raw)`` where ``end`` is the index just past the nucleus and
        ``raw`` is ``s[start:end]``, combining marks included, so the caller
        can recover the tone with ``_strip_diacritic``. When no nucleus is
        found the result is ``(start, "")``.

    Raises:
        ValueError: If a combining mark is met before any base letter.
    """
    total = len(s)
    if start >= total:
        return start, ""

    letters = []      # base letters gathered so far (at most two)
    letter_ends = []  # for each letter: index just past it and its marks
    cursor = start
    while cursor < total and len(letters) < 2:
        current = s[cursor]
        if unicodedata.category(current) == "Mn":
            # A mark with no base letter before it cannot be attached.
            if not letters:
                raise ValueError("orphan combining mark at start of nucleus")
            cursor += 1
            continue
        is_candidate = current in _YALE_VOWEL_LETTERS or current in ("m", "n", "g", "y")
        if not is_candidate:
            break
        letters.append(current)
        cursor += 1
        # Swallow the marks that belong to this letter.
        while cursor < total and unicodedata.category(s[cursor]) == "Mn":
            cursor += 1
        letter_ends.append(cursor)

    if not letters:
        return start, ""

    joined = "".join(letters)

    # The table is ordered longest-first, so the first hit is the longest.
    matched = next(
        (cand for cand in _YALE_NUCLEI_ORDERED if joined.startswith(cand)),
        None,
    )
    if matched is not None:
        stop = letter_ends[len(matched) - 1]
        return stop, s[start:stop]

    # Nothing in the table matched: accept a lone vowel or "m" as nucleus.
    if letters[0] in {"a", "e", "i", "o", "u", "m"}:
        stop = letter_ends[0]
        return stop, s[start:stop]

    # 'y', 'n', 'g' alone are not valid nuclei.
    return start, ""


def _build_jyutping(onset_yale, nucleus_yale, coda_yale, tone):
    """Assemble a Jyutping syllable from decomposed Yale parts.

    Args:
        onset_yale: Yale onset ("" for none).
        nucleus_yale: Yale nucleus in base-letter form (no diacritic).
        coda_yale: Yale coda ("" for none); "w" is accepted in addition to
            the codas listed in the coda table.
        tone: The Jyutping tone, appended as-is.

    Returns:
        The Jyutping syllable: onset + nucleus + coda + tone.

    Raises:
        ValueError: If the onset, coda or nucleus is not a known Yale
            spelling. The onset is checked first, then the coda, then the
            nucleus.
    """
    if onset_yale not in _ONSETS_JYUTPING:
        raise ValueError("unknown Yale onset -- " + repr(onset_yale))
    if coda_yale != "w" and coda_yale not in _CODAS_JYUTPING:
        raise ValueError("unknown Yale coda -- " + repr(coda_yale))

    # An onset-less Yale "yu" is read as Jyutping "j" + "yu": Yale never
    # writes "yyu", so "yu..." stands for Jyutping "jyu...".
    is_bare_yu = onset_yale == "" and nucleus_yale == "yu"
    onset_jp = "j" if is_bare_yu else _ONSETS_JYUTPING[onset_yale]

    # Yale coda "w" is the respelling of Jyutping coda "u" in the final
    # "eu" (Yale "ew").
    coda_jp = "u" if coda_yale == "w" else _CODAS_JYUTPING[coda_yale]

    if nucleus_yale == "eu":
        # Two Jyutping nuclei share this spelling; the coda decides.
        nucleus_jp = _resolve_eu(coda_yale)
    elif nucleus_yale == "a" and coda_jp == "":
        # Coda-less Yale "a" is Jyutping long "aa"; with a coda it stays "a".
        nucleus_jp = "aa"
    else:
        nucleus_jp = _NUCLEI_JYUTPING_UNAMBIGUOUS.get(nucleus_yale)
        if nucleus_jp is None:
            raise ValueError("unknown Yale nucleus -- " + repr(nucleus_yale))

    return f"{onset_jp}{nucleus_jp}{coda_jp}{tone}"


def _convert_syllable(raw: str) -> str:
    """Convert a single NFD-form Yale syllable into a Jyutping string.

    The syllable is decomposed in the same order the splitter uses (onset,
    nucleus, then coda together with the low-tone "h"), the tone is derived
    from the diacritic and the "h", and the parts are handed to
    ``_build_jyutping``.

    Args:
        raw: One Yale syllable in NFD form, as produced by ``_split_piece``.

    Returns:
        The Jyutping spelling of the syllable, tone digit included.

    Raises:
        ValueError: If the syllable has no recognizable onset/vowel or
            nucleus, or if ``_consume_nucleus``, ``_strip_diacritic``,
            ``_tone_from`` or ``_build_jyutping`` rejects a part.
    """
    n = len(raw)

    # ---- onset ----
    # First match wins; the candidate list is ordered longest-first.
    onset = ""
    for cand in _YALE_ONSETS_ORDERED:
        if raw.startswith(cand):
            onset = cand
            break
    nucleus_start = len(onset)

    # Backtrack: 'm'/'ng' may be a syllabic nasal nucleus.
    # This branch is only entered when the onset is NOT followed by a vowel.
    if nucleus_start >= n or raw[nucleus_start] not in _YALE_VOWEL_LETTERS:
        if onset in ("m", "ng") and _looks_like_syllabic(raw, 0, onset):
            onset = ""
            nucleus_start = 0
        elif onset == "" and raw and raw[0] in _YALE_VOWEL_LETTERS:
            pass
        else:
            # Only the "no onset and no vowel" case fails here; a real onset
            # followed by a non-vowel is judged by the nucleus stage below.
            if onset == "":
                raise ValueError(
                    "cannot parse Yale syllable -- "
                    + repr(unicodedata.normalize("NFC", raw))
                )

    # Backtrack onset "y" if it's really the prefix of nucleus "yu".
    if onset == "y":
        # Trial parse from the "y" itself, as if there were no onset.
        test_end, test_raw = _consume_nucleus(raw, 0)
        test_base = "".join(c for c in test_raw if unicodedata.category(c) != "Mn")
        if test_base == "yu" and _yu_compatible_tail(raw, test_end):
            onset = ""
            nucleus_start = 0

    # ---- nucleus ----
    nuc_end, nuc_chars = _consume_nucleus(raw, nucleus_start)
    if nuc_end == nucleus_start:
        # _consume_nucleus signals "nothing consumed" by returning its start.
        raise ValueError(
            "cannot parse Yale syllable -- " + repr(unicodedata.normalize("NFC", raw))
        )
    base_nucleus, mark = _strip_diacritic(nuc_chars)

    # ---- coda + h-marker ----
    # Yale low-tone 'h' placement: BEFORE stop/nasal coda, AFTER glide coda.
    # Syllabic nasals (no onset) never take a coda.
    is_syllabic_nasal = onset == "" and base_nucleus in ("m", "ng")
    pos = nuc_end
    has_h = False
    coda = ""

    if is_syllabic_nasal and (pos >= n or raw[pos] != "h"):
        # No 'h' follows: no coda (e.g. ng3 or m3 standalone syllables).
        tone = _tone_from(mark, has_h)
        return _build_jyutping(onset, base_nucleus, coda, tone)

    if pos < n and raw[pos] == "h":
        # "h" directly after the nucleus: low tone, optionally followed by a
        # stop/nasal coda.
        after_h = pos + 1
        for cand in ("ng", "p", "t", "k", "m", "n"):
            if raw.startswith(cand, after_h):
                has_h = True
                coda = cand
                break
        else:
            # for/else: no coda candidate matched, so the "h" ends the rime.
            has_h = True
            coda = ""
    else:
        # No "h" right after the nucleus: look for a glide coda first ...
        for cand in ("i", "u", "w"):
            if raw.startswith(cand, pos):
                coda = cand
                coda_end = pos + len(cand)
                if coda_end < n and raw[coda_end] == "h":
                    has_h = True
                break
        else:
            # ... and only if there is none, for a stop/nasal coda.
            for cand in ("ng", "p", "t", "k", "m", "n"):
                if raw.startswith(cand, pos):
                    coda = cand
                    break

    # Characters after the recognized coda / "h" are not inspected here.
    tone = _tone_from(mark, has_h)
    return _build_jyutping(onset, base_nucleus, coda, tone)



[docs]
def yale_to_jyutping(yale: str | list[str]) -> list[str]:
    """Convert Yale romanization into Jyutping romanization.

    This is the inverse of :func:`jyutping_to_yale` and expects the same
    style of Yale: tone diacritics plus ``h`` as the low-tone marker.

    Args:
        yale (str or list[str]): Yale for a single word, or a list of such
            strings, one per word. Within one string, whitespace and
            apostrophes ``'`` only mark syllable boundaries; word boundaries
            can be expressed solely by passing a ``list[str]``.

    Returns:
        list[str]: One element per input word, each holding that word's
        Jyutping syllables separated by a single space. An empty input gives
        ``[]``.

    Raises:
        ValueError: If the Yale romanization is illegal (e.g., with
            unrecognized elements or a missing low-tone marker on a tone-4
            grave-accented syllable).

    Examples:
        >>> yale_to_jyutping("gwóngdūngwá")  # 廣東話, Cantonese
        ['gwong2 dung1 waa2']
        >>> yale_to_jyutping(["gāmyaht", "góng", "gwóngdūngwá"])
        ['gam1 jat6', 'gong2', 'gwong2 dung1 waa2']
    """
    if not yale:
        return []
    if isinstance(yale, str):
        # A bare string is a single word.
        yale = [yale]
    return [
        " ".join(map(_convert_syllable, _split_word_syllables(yale_word)))
        for yale_word in yale
    ]