from __future__ import annotations
from .characters import characters_to_jyutping
from .ipa import jyutping_to_ipa
[docs]
def g2p(
    chars: str | list[str],
    *,
    onsets: dict[str, str] | None = None,
    nuclei: dict[str, str] | None = None,
    codas: dict[str, str] | None = None,
    tones: dict[str, str] | None = None,
) -> list[tuple[str, str | None]]:
    """Transcribe Cantonese characters as IPA, word by word (G2P).

    A thin convenience wrapper chaining
    :func:`~pycantonese.characters_to_jyutping` and
    :func:`~pycantonese.jyutping_to_ipa`: the input is turned into
    (word, Jyutping) pairs first, and every Jyutping string that is not
    ``None`` is then converted to IPA.

    Args:
        chars (str or list[str]): The Cantonese text to transcribe. When a
            plain string is given, word segmentation is run on it
            (by :func:`~pycantonese.segment`) to help disambiguate the
            character-to-Jyutping mapping. To bypass segmentation, pass a
            list of strings that is already split the way you want.
        onsets (dict[str, str], optional): Jyutping-onset to IPA-symbol
            overrides, handed unchanged to
            :func:`~pycantonese.jyutping_to_ipa`.
        nuclei (dict[str, str], optional): Jyutping-nucleus to IPA-symbol
            overrides, handed unchanged to
            :func:`~pycantonese.jyutping_to_ipa`.
        codas (dict[str, str], optional): Jyutping-coda to IPA-symbol
            overrides, handed unchanged to
            :func:`~pycantonese.jyutping_to_ipa`.
        tones (dict[str, str], optional): Jyutping-tone to IPA-symbol
            overrides, handed unchanged to
            :func:`~pycantonese.jyutping_to_ipa`.

    Returns:
        list[tuple[str, str | None]]: One 2-tuple per word, in input order,
        of the form (Cantonese characters, IPA string). Syllables inside an
        IPA string are separated by a single space. A word without a
        Jyutping mapping (e.g. an unseen character or a punctuation mark)
        has ``None`` as its second element.

    Examples:
        >>> g2p("香港人講廣東話。")  # Hongkongers speak Cantonese.
        [('香港人', 'hœŋ55 kɔŋ25 jɐn21'), ('講', 'kɔŋ25'), ('廣東話', 'kʷɔŋ25 tʊŋ55 waː25'), ('。', None)]

    See Also:
        :func:`~pycantonese.characters_to_jyutping`,
        :func:`~pycantonese.jyutping_to_ipa`.
    """  # noqa: E501
    # The override tables are the same for every word, so bundle them once.
    overrides = {
        "onsets": onsets,
        "nuclei": nuclei,
        "codas": codas,
        "tones": tones,
    }

    pairs: list[tuple[str, str | None]] = []
    for word, jyutping in characters_to_jyutping(chars):
        if jyutping is None:
            # Nothing to convert: keep the word and flag the missing IPA.
            pairs.append((word, None))
            continue
        # Strict single-element unpacking: exactly one IPA string is expected
        # back per word, and anything else raises instead of being truncated.
        (ipa_string,) = jyutping_to_ipa(jyutping, **overrides)
        pairs.append((word, ipa_string))
    return pairs