# Source code for pycantonese.jyutping.ipa

from __future__ import annotations

from functools import lru_cache

from .parse_jyutping import parse_jyutping

# Default mapping from Jyutping onsets to IPA symbols, looked up by
# _word_to_ipa_syllables for every parsed syllable. The empty-string key
# covers syllables that have no onset. Individual entries can be overridden
# per call through the ``onsets`` argument of jyutping_to_ipa.
_ONSETS = {
    "b": "p",
    "d": "t",
    "g": "k",
    "gw": "kʷ",
    "z": "ts",
    "p": "pʰ",
    "t": "tʰ",
    "k": "kʰ",
    "kw": "kʷʰ",
    "c": "tsʰ",
    "m": "m",
    "n": "n",
    "ng": "ŋ",
    "f": "f",
    "h": "h",
    "s": "s",
    "l": "l",
    "w": "w",
    "j": "j",
    "": "",
}

# Default mapping from Jyutping nuclei to IPA symbols. The values here are
# the context-free defaults; the inline notes list the variants that
# _word_to_ipa_syllables substitutes depending on the syllable's coda.
# Individual entries can be overridden per call through the ``nuclei``
# argument of jyutping_to_ipa.
_NUCLEI = {
    "aa": "aː",
    "a": "ɐ",
    "i": "i",  # becomes ɪ before coda ng or k
    "yu": "y",
    "u": "u",  # becomes ʊ before coda ng or k
    "oe": "œ",
    "e": "ɛ",  # becomes e before coda i
    "eo": "ɵ",
    "o": "ɔ",  # becomes o before coda u
    # Nasals listed as nuclei (presumably syllabic nasals -- confirm against
    # what parse_jyutping returns for such syllables).
    "m": "m",
    "n": "n",
    "ng": "ŋ",
}

# Default mapping from Jyutping codas to IPA symbols. The empty-string key
# covers syllables that have no coda. Individual entries can be overridden
# per call through the ``codas`` argument of jyutping_to_ipa.
_CODAS = {
    "p": "p̚",
    "t": "t̚",
    "k": "k̚",
    "m": "m",
    "n": "n",
    "ng": "ŋ",
    "i": "i",  # becomes y after nucleus eo, u, or o
    "u": "u",
    "": "",
}

# Default mapping from Jyutping tone numbers to the digit strings appended
# to each IPA syllable. Keys are strings (not ints) because they are looked
# up with the ``tone`` attribute of a parsed syllable. Individual entries can
# be overridden per call through the ``tones`` argument of jyutping_to_ipa.
_TONES = {
    "1": "55",
    "2": "25",
    "3": "33",
    "4": "21",
    "5": "23",
    "6": "22",
}


@lru_cache
def _replace(current, parsed, part_to_match, matches, default):
    if getattr(parsed, part_to_match) in matches:
        return default
    else:
        return current



# [docs]
def jyutping_to_ipa(
    jp: str | list[str],
    *,
    onsets: dict[str, str] | None = None,
    nuclei: dict[str, str] | None = None,
    codas: dict[str, str] | None = None,
    tones: dict[str, str] | None = None,
) -> list[str]:
    """Translate Jyutping romanization to IPA, one output string per word.

    The default Jyutping-to-IPA correspondences follow Matthews and Yip
    (2011: 461-463). Each of the four keyword arguments lets the caller
    swap in different IPA symbols for selected Jyutping units.

    Args:
        jp (str or list[str]): Either one Jyutping string, treated as a
            single word (it may contain any number of syllables, optionally
            separated by spaces), or a list of such strings where each
            element is one word, i.e. the list carries the word
            segmentation.
        onsets (dict[str, str], optional): Custom Jyutping-onset-to-IPA
            mapping. Jyutping "z" is /ts/ by default; ``{"z": "tʃ"}``
            renders it as /tʃ/.
        nuclei (dict[str, str], optional): Custom Jyutping-nucleus-to-IPA
            mapping. Jyutping "i" is /i/ by default; ``{"i": "iː"}``
            renders it as /iː/.
        codas (dict[str, str], optional): Custom Jyutping-coda-to-IPA
            mapping. Jyutping "p" is /p̚/ by default; ``{"p": "p"}``
            renders it as /p/.
        tones (dict[str, str], optional): Custom Jyutping-tone-to-IPA
            mapping. Jyutping "2" (high-rising tone) is /25/ by default;
            ``{"2": "35"}`` renders it as /35/.

    Returns:
        list[str]: One IPA string per input word, in input order. Within a
        word, syllables are joined by a single space. An empty ``jp``
        (empty string, empty list, or ``None``) yields an empty list.

    Examples:
        >>> jyutping_to_ipa('gwong2dung1waa2')  # 廣東話 Cantonese
        ['kʷɔŋ25 tʊŋ55 waː25']
        >>> jyutping_to_ipa(['gwong2dung1', 'waa2'])
        ['kʷɔŋ25 tʊŋ55', 'waː25']
        >>> jyutping_to_ipa('ci1', onsets={'c': "tʃ'"})
        ["tʃ'i55"]
        >>> jyutping_to_ipa('ci1', tones={'1': "˥"})
        ['tsʰi˥']

    See Also:
        :func:`~pycantonese.g2p`: One-shot grapheme-to-phoneme conversion that
        composes :func:`~pycantonese.characters_to_jyutping` with this function.
    """
    # Nothing to convert.
    if not jp:
        return []

    # A bare string is a single word; a list already carries segmentation.
    segments = jp
    if isinstance(jp, str):
        segments = [jp]

    ipa_words = []
    for segment in segments:
        syllables = _word_to_ipa_syllables(segment, onsets, nuclei, codas, tones)
        ipa_words.append(" ".join(syllables))
    return ipa_words



def _word_to_ipa_syllables(
    word: str,
    onsets: dict[str, str] | None,
    nuclei: dict[str, str] | None,
    codas: dict[str, str] | None,
    tones: dict[str, str] | None,
) -> list[str]:
    """Convert one Jyutping word into a list of IPA syllable strings.

    The word is parsed with ``parse_jyutping``. Each parsed syllable is
    rendered as onset + nucleus + coda + tone using the module-level default
    tables, after which coda-dependent nucleus variants and the
    nucleus-dependent coda variant are applied. Caller-supplied overrides
    are consulted last, keyed by the Jyutping unit, so an override for a
    unit replaces whatever the defaults and variant rules produced for it.

    Args:
        word: Jyutping string for a single word.
        onsets: Optional Jyutping-onset-to-IPA overrides, or ``None``.
        nuclei: Optional Jyutping-nucleus-to-IPA overrides, or ``None``.
        codas: Optional Jyutping-coda-to-IPA overrides, or ``None``.
        tones: Optional Jyutping-tone-to-IPA overrides, or ``None``.

    Returns:
        One IPA string per syllable of ``word``, in order.
    """
    # Jyutping nucleus -> (codas that trigger the variant, variant IPA symbol).
    nucleus_variants = {
        "i": (("ng", "k"), "ɪ"),
        "u": (("ng", "k"), "ʊ"),
        "e": (("i",), "e"),
        "o": (("u",), "o"),
    }

    # Treat a missing (or empty) override mapping as "no overrides".
    onset_overrides = onsets or {}
    nucleus_overrides = nuclei or {}
    coda_overrides = codas or {}
    tone_overrides = tones or {}

    syllables = []
    for syl in parse_jyutping(word):
        ipa_onset = _ONSETS[syl.onset]
        ipa_nucleus = _NUCLEI[syl.nucleus]
        ipa_coda = _CODAS[syl.coda]
        ipa_tone = _TONES[syl.tone]

        # Nucleus quality depends on the following coda for i, u, e, o.
        variant = nucleus_variants.get(syl.nucleus)
        if variant is not None:
            trigger_codas, variant_symbol = variant
            ipa_nucleus = _replace(
                ipa_nucleus, syl, "coda", trigger_codas, variant_symbol
            )

        # Coda "i" is rendered as y after the nuclei eo, u, and o.
        if syl.coda == "i":
            ipa_coda = _replace(ipa_coda, syl, "nucleus", ("eo", "u", "o"), "y")

        syllables.append(
            onset_overrides.get(syl.onset, ipa_onset)
            + nucleus_overrides.get(syl.nucleus, ipa_nucleus)
            + coda_overrides.get(syl.coda, ipa_coda)
            + tone_overrides.get(syl.tone, ipa_tone)
        )

    return syllables