Source code for pycantonese.jyutping.characters

from __future__ import annotations

from collections import Counter, defaultdict
from functools import lru_cache

from ..corpus import hkcancor, Token
from ..data.rime_cantonese import CHARS_TO_JYUTPING
from ..jyutping.parse_jyutping import parse_jyutping
from ..word_segmentation.segmenter import segment


@lru_cache(maxsize=1)
def _get_words_characters_to_jyutping():
    """Build the word-level and character-level Jyutping lookup tables.

    Every HKCanCor token is tallied twice: once as a whole word mapped to
    its space-separated Jyutping, and once per character mapped to that
    character's syllable. A token is skipped when its word or Jyutping is
    empty, when ``parse_jyutping`` raises ``ValueError``, or when the number
    of characters differs from the number of parsed syllables (in which
    case characters cannot be paired with syllables one-to-one).

    The most frequent reading is kept for each word and each character,
    and the result is then overlaid with the rime-cantonese data.

    The result is memoized with ``lru_cache``, so the corpus is only
    traversed on the first call.

    Returns:
        tuple[dict, dict]: ``(words_to_jyutping, chars_to_jp)``. The first
        maps words to Jyutping strings; the second maps single characters
        to a Jyutping syllable.
    """
    word_tallies = defaultdict(Counter)
    char_tallies = defaultdict(Counter)

    for token in hkcancor().tokens():
        token: Token
        word = token.word
        jyutping = token.jyutping

        if not (jyutping and word):
            continue
        try:
            parsed = parse_jyutping(jyutping)
        except ValueError:
            continue
        if len(word) != len(parsed):
            continue

        syllables = [str(syllable) for syllable in parsed]
        word_tallies[word][" ".join(syllables)] += 1
        for char, syllable in zip(word, syllables):
            char_tallies[char][syllable] += 1

    def _most_frequent(tallies):
        # Collapse each Counter to its single most common reading.
        return {
            key: counter.most_common(1)[0][0]
            for key, counter in tallies.items()
        }

    # Merge order matters: rime-cantonese (more accurate data) is applied
    # last so that it overrides HKCanCor wherever the two disagree.
    words_to_jyutping = _most_frequent(word_tallies)
    words_to_jyutping.update(CHARS_TO_JYUTPING)

    # Only the single-character rime-cantonese entries belong in the
    # character-level table.
    chars_to_jp = _most_frequent(char_tallies)
    chars_to_jp.update(
        {key: value for key, value in CHARS_TO_JYUTPING.items() if len(key) == 1}
    )

    return words_to_jyutping, chars_to_jp



[docs]
def characters_to_jyutping(
    chars: str | list[str],
) -> list[tuple[str, str | None]]:
    """Romanize Cantonese characters as Jyutping, word by word.

    Lookups rely on data derived from the HKCanCor corpus and
    rime-cantonese. Each word is first looked up as a whole; if the word is
    unknown, its Jyutping is assembled character by character. When any
    character of an unknown word has no known reading (an unseen character
    or a punctuation mark, for instance), the whole word maps to ``None``.

    Args:
        chars (str or list[str]): The text to convert. A plain string is
            word-segmented first (by :func:`~pycantonese.segment`), which
            helps resolve ambiguity in mapping characters to Jyutping.
            To bypass word segmentation, pass a list of strings that
            already reflects the segmentation you want.

    Returns:
        list[tuple[str, str | None]]: One 2-tuple per word, of the form
        (Cantonese characters, Jyutping romanization). Syllables within a
        Jyutping string are separated by a single space. An empty input
        gives an empty list.

    Examples:
        >>> characters_to_jyutping("香港人講廣東話。")  # Hongkongers speak Cantonese.
        [('香港人', 'hoeng1 gong2 jan4'), ('講', 'gong2'), ('廣東話', 'gwong2 dung1 waa2'), ('。', None)]

    See Also:
        :func:`~pycantonese.g2p`: One-shot grapheme-to-phoneme conversion that
        composes this function with :func:`~pycantonese.jyutping_to_ipa`.
    """  # noqa: E501
    if not chars:
        return []

    # A list is taken as the caller's own segmentation; anything else is
    # handed to the segmenter.
    words = chars if isinstance(chars, list) else segment(chars)
    word_lookup, char_lookup = _get_words_characters_to_jyutping()

    pairs: list[tuple[str, str | None]] = []
    for word in words:
        romanization: str | None
        if word in word_lookup:
            # Whole-word match takes priority over per-character readings.
            romanization = word_lookup[word]
        elif all(char in char_lookup for char in word):
            romanization = " ".join(char_lookup[char] for char in word)
        else:
            # At least one character is unknown, so no partial result.
            romanization = None
        pairs.append((word, romanization))
    return pairs